From 25916d818a7143638695987549186bac06b8ed91 Mon Sep 17 00:00:00 2001 From: Martin Kinkelin Date: Thu, 2 Oct 2025 14:47:09 +0200 Subject: [PATCH 01/10] ldc.intrinsics: Prepare for LLVM 21 final --- runtime/druntime/src/ldc/intrinsics.di | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/druntime/src/ldc/intrinsics.di b/runtime/druntime/src/ldc/intrinsics.di index c0d88d8e2d..e3648ee60b 100644 --- a/runtime/druntime/src/ldc/intrinsics.di +++ b/runtime/druntime/src/ldc/intrinsics.di @@ -26,7 +26,7 @@ else version (LDC_LLVM_1800) enum LLVM_version = 1800; else version (LDC_LLVM_1801) enum LLVM_version = 1801; else version (LDC_LLVM_1901) enum LLVM_version = 1901; else version (LDC_LLVM_2001) enum LLVM_version = 2001; -else version (LDC_LLVM_2100) enum LLVM_version = 2100; +else version (LDC_LLVM_2101) enum LLVM_version = 2101; else static assert(false, "LDC LLVM version not supported"); enum LLVM_atleast(int major) = (LLVM_version >= major * 100); From 27fcd0181b9afb6d37edbd12db363c322d951e34 Mon Sep 17 00:00:00 2001 From: Martin Kinkelin Date: Thu, 2 Oct 2025 15:36:29 +0200 Subject: [PATCH 02/10] Bump LDC-LLVM to v21.1.5 --- .github/actions/1-setup/action.yml | 14 +++++++------- .github/workflows/main.yml | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/actions/1-setup/action.yml b/.github/actions/1-setup/action.yml index a2cd5c9bfb..8f0ee4a8b1 100644 --- a/.github/actions/1-setup/action.yml +++ b/.github/actions/1-setup/action.yml @@ -37,30 +37,30 @@ runs: # Make sure to link libzstd statically sudo rm /usr/lib/$arch-linux-gnu/libzstd.so - - name: 'Linux: Install clang 20 from apt.llvm.org' + - name: 'Linux: Install clang 21 from apt.llvm.org' if: runner.os == 'Linux' shell: bash run: | set -eux cd .. curl -fL --retry 3 --max-time 30 -O https://apt.llvm.org/llvm.sh - sudo bash llvm.sh 20 + sudo bash llvm.sh 21 for tool in clang clang++ ld.lld; do - sudo ln -sf $tool-20 /usr/bin/$tool + sudo ln -sf $tool-21 /usr/bin/$tool $tool --version done - - name: 'macOS arm64: Install Homebrew clang 20' # see mimalloc comment in ../3-build-native/action.yml + - name: 'macOS arm64: Install Homebrew clang 21' # see mimalloc comment in ../3-build-native/action.yml if: runner.os == 'macOS' && inputs.arch == 'arm64' shell: bash - run: brew install llvm@20 - - name: 'Windows: Install clang v20.1.3 from GitHub' + run: brew install llvm@21 + - name: 'Windows: Install clang v21.1.5 from GitHub' if: runner.os == 'Windows' shell: bash run: | set -eux cd .. 
curl -fL --retry 3 --max-time 300 -o clang.exe \ - https://github.com/llvm/llvm-project/releases/download/llvmorg-20.1.3/LLVM-20.1.3-win64.exe + https://github.com/llvm/llvm-project/releases/download/llvmorg-21.1.5/LLVM-21.1.5-win64.exe ./clang.exe //S # double-slash for bash rm clang.exe # C:\Program Files\LLVM\bin should already be in PATH diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d291ba3d48..4f1f59043b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -14,7 +14,7 @@ concurrency: cancel-in-progress: true env: - LLVM_VERSION: 20.1.5 + LLVM_VERSION: c922a5f9 jobs: build-native: @@ -85,8 +85,8 @@ jobs: os: macos-15 arch: arm64 extra_cmake_flags: >- - -DCMAKE_C_COMPILER=/opt/homebrew/opt/llvm@20/bin/clang - -DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm@20/bin/clang++ + -DCMAKE_C_COMPILER=/opt/homebrew/opt/llvm@21/bin/clang + -DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm@21/bin/clang++ -DD_COMPILER_FLAGS="-O -flto=full -defaultlib=phobos2-ldc-lto,druntime-ldc-lto -L-exported_symbol '-L__*' -L-w" -DEXTRA_CXXFLAGS=-flto=full with_pgo: true From 30e68f06ebafcd06100d60f657f297db9813c7ae Mon Sep 17 00:00:00 2001 From: Martin Kinkelin Date: Sat, 4 Oct 2025 13:48:24 +0200 Subject: [PATCH 03/10] [add vendored LLVM 21 tools and utils] --- tools/ldc-profdata/llvm-profdata-21.1.cpp | 3509 +++++++++++++++++ .../ldc-profgen-21.1/CMakeLists.txt | 25 + .../ldc-profgen-21.1/CSPreInliner.cpp | 316 ++ .../ldc-profgen-21.1/CSPreInliner.h | 96 + .../ldc-profgen-21.1/CallContext.h | 58 + .../ldc-profgen-21.1/ErrorHandling.h | 56 + .../ldc-profgen-21.1/MissingFrameInferrer.cpp | 318 ++ .../ldc-profgen-21.1/MissingFrameInferrer.h | 116 + .../ldc-profgen-21.1/PerfReader.cpp | 1286 ++++++ .../ldc-profgen/ldc-profgen-21.1/PerfReader.h | 746 ++++ .../ldc-profgen-21.1/ProfileGenerator.cpp | 1371 +++++++ .../ldc-profgen-21.1/ProfileGenerator.h | 401 ++ .../ldc-profgen-21.1/ProfiledBinary.cpp | 1035 +++++ .../ldc-profgen-21.1/ProfiledBinary.h | 620 +++ .../ldc-profgen-21.1/llvm-profgen.cpp | 193 + utils/FileCheck-21.cpp | 879 +++++ 16 files changed, 11025 insertions(+) create mode 100644 tools/ldc-profdata/llvm-profdata-21.1.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/CMakeLists.txt create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/CSPreInliner.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/CSPreInliner.h create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/CallContext.h create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/ErrorHandling.h create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/MissingFrameInferrer.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/MissingFrameInferrer.h create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/PerfReader.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/PerfReader.h create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/ProfileGenerator.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/ProfileGenerator.h create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/ProfiledBinary.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/ProfiledBinary.h create mode 100644 tools/ldc-profgen/ldc-profgen-21.1/llvm-profgen.cpp create mode 100644 utils/FileCheck-21.cpp diff --git a/tools/ldc-profdata/llvm-profdata-21.1.cpp b/tools/ldc-profdata/llvm-profdata-21.1.cpp new file mode 100644 index 0000000000..0c69d7f786 --- /dev/null +++ b/tools/ldc-profdata/llvm-profdata-21.1.cpp @@ -0,0 +1,3509 @@ +//===- llvm-profdata.cpp - LLVM profile data tool -------------------------===// +// +// 
Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// llvm-profdata merges .profdata files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Debuginfod/HTTPClient.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/ProfileData/DataAccessProf.h"
+#include "llvm/ProfileData/InstrProfCorrelator.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ProfileData/InstrProfWriter.h"
+#include "llvm/ProfileData/MemProf.h"
+#include "llvm/ProfileData/MemProfReader.h"
+#include "llvm/ProfileData/MemProfSummaryBuilder.h"
+#include "llvm/ProfileData/MemProfYAML.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/ProfileData/SampleProfReader.h"
+#include "llvm/ProfileData/SampleProfWriter.h"
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Discriminator.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/LLVMDriver.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Support/Threading.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cmath>
+#include <optional>
+#include <queue>
+
+using namespace llvm;
+using ProfCorrelatorKind = InstrProfCorrelator::ProfCorrelatorKind;
+
+// https://llvm.org/docs/CommandGuide/llvm-profdata.html has documentation
+// on each subcommand.
+cl::SubCommand ShowSubcommand(
+    "show",
+    "Takes a profile data file and displays the profiles. See detailed "
+    "documentation in "
+    "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-show");
+cl::SubCommand OrderSubcommand(
+    "order",
+    "Reads temporal profiling traces from a profile and outputs a function "
+    "order that reduces the number of page faults for those traces. See "
+    "detailed documentation in "
+    "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-order");
+cl::SubCommand OverlapSubcommand(
+    "overlap",
+    "Computes and displays the overlap between two profiles. See detailed "
+    "documentation in "
+    "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-overlap");
+cl::SubCommand MergeSubcommand(
+    "merge",
+    "Takes several profiles and merges them together. See detailed "
+    "documentation in "
+    "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-merge");
+
+namespace {
+enum ProfileKinds { instr, sample, memory };
+enum FailureMode { warnOnly, failIfAnyAreInvalid, failIfAllAreInvalid };
+
+enum ProfileFormat {
+  PF_None = 0,
+  PF_Text,
+  PF_Compact_Binary, // Deprecated
+  PF_Ext_Binary,
+  PF_GCC,
+  PF_Binary
+};
+
+enum class ShowFormat { Text, Json, Yaml };
+} // namespace
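// Illustrative sketch, not from llvm-profdata: a minimal example of the
// cl::SubCommand pattern used above. An option bound to a subcommand via
// cl::sub() is only accepted when that subcommand is active. The names
// DemoSub and DemoOutput are hypothetical.

#include "llvm/Support/CommandLine.h"
using namespace llvm;

static cl::SubCommand DemoSub("demo", "Illustrative subcommand");
static cl::opt<std::string> DemoOutput("demo-output", cl::init("-"),
                                       cl::desc("Output file"),
                                       cl::sub(DemoSub));

int main(int argc, char **argv) {
  cl::ParseCommandLineOptions(argc, argv);
  // `tool demo --demo-output=x` activates DemoSub and sets DemoOutput to "x";
  // without the "demo" subcommand, --demo-output is rejected.
  return 0;
}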
+
+// Common options.
+cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                    cl::init("-"), cl::desc("Output file"),
+                                    cl::sub(ShowSubcommand),
+                                    cl::sub(OrderSubcommand),
+                                    cl::sub(OverlapSubcommand),
+                                    cl::sub(MergeSubcommand));
+// NOTE: cl::alias must not have cl::sub(), since aliased option's cl::sub()
+// will be used. llvm::cl::alias::done() method asserts this condition.
+static cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
+                                 cl::aliasopt(OutputFilename));
+
+// Options common to at least two commands.
+static cl::opt<ProfileKinds> ProfileKind(
+    cl::desc("Profile kind:"), cl::sub(MergeSubcommand),
+    cl::sub(OverlapSubcommand), cl::init(instr),
+    cl::values(clEnumVal(instr, "Instrumentation profile (default)"),
+               clEnumVal(sample, "Sample profile")));
+static cl::opt<std::string> Filename(cl::Positional,
+                                     cl::desc("<profdata-file>"),
+                                     cl::sub(ShowSubcommand),
+                                     cl::sub(OrderSubcommand));
+static cl::opt<unsigned> MaxDbgCorrelationWarnings(
+    "max-debug-info-correlation-warnings",
+    cl::desc("The maximum number of warnings to emit when correlating "
+             "profile from debug info (0 = no limit)"),
+    cl::sub(MergeSubcommand), cl::sub(ShowSubcommand), cl::init(5));
+static cl::opt<std::string> ProfiledBinary(
+    "profiled-binary", cl::init(""),
+    cl::desc("Path to binary from which the profile was collected."),
+    cl::sub(ShowSubcommand), cl::sub(MergeSubcommand));
+static cl::opt<std::string> DebugInfoFilename(
+    "debug-info", cl::init(""),
+    cl::desc(
+        "For show, read and extract profile metadata from debug info and show "
+        "the functions it found. For merge, use the provided debug info to "
+        "correlate the raw profile."),
+    cl::sub(ShowSubcommand), cl::sub(MergeSubcommand));
+static cl::opt<std::string>
+    BinaryFilename("binary-file", cl::init(""),
+                   cl::desc("For merge, use the provided unstripped binary to "
+                            "correlate the raw profile."),
+                   cl::sub(MergeSubcommand));
+static cl::list<std::string> DebugFileDirectory(
+    "debug-file-directory",
+    cl::desc("Directories to search for object files by build ID"));
+static cl::opt<bool> DebugInfod("debuginfod", cl::init(false), cl::Hidden,
+                                cl::sub(MergeSubcommand),
+                                cl::desc("Enable debuginfod"));
+static cl::opt<ProfCorrelatorKind> BIDFetcherProfileCorrelate(
+    "correlate",
+    cl::desc("Use debug-info or binary correlation to correlate profiles with "
+             "build id fetcher"),
+    cl::init(InstrProfCorrelator::NONE),
+    cl::values(clEnumValN(InstrProfCorrelator::NONE, "",
+                          "No profile correlation"),
+               clEnumValN(InstrProfCorrelator::DEBUG_INFO, "debug-info",
+                          "Use debug info to correlate"),
+               clEnumValN(InstrProfCorrelator::BINARY, "binary",
+                          "Use binary to correlate")));
+static cl::opt<std::string> FuncNameFilter(
+    "function",
+    cl::desc("Only functions matching the filter are shown in the output. For "
+             "overlapping CSSPGO, this takes a function name with calling "
+             "context."),
+    cl::sub(ShowSubcommand), cl::sub(OverlapSubcommand),
+    cl::sub(MergeSubcommand));
+
+// TODO: Consider creating a template class (e.g., MergeOption, ShowOption) to
+// factor out the common cl::sub in cl::opt constructor for subcommand-specific
+// options.
+
+// Options specific to merge subcommand.
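// Illustrative sketch, not from llvm-profdata: per the llvm-profdata
// documentation, the -weighted-input entries declared just below take the
// form "<weight>,<filename>". A minimal way to split such an entry;
// DemoWeightedFile and parseWeightedEntry are hypothetical names.

#include "llvm/ADT/StringRef.h"
#include <cstdint>
#include <string>

struct DemoWeightedFile {
  std::string Filename;
  uint64_t Weight = 1;
};

static DemoWeightedFile parseWeightedEntry(llvm::StringRef Entry) {
  DemoWeightedFile WF;
  // "3,foo.profraw" splits into ("3", "foo.profraw").
  auto [WeightStr, Path] = Entry.split(',');
  if (Path.empty() || WeightStr.getAsInteger(/*Radix=*/10, WF.Weight)) {
    // No comma or non-numeric weight: treat the whole entry as a filename.
    WF.Filename = Entry.str();
    WF.Weight = 1;
  } else {
    WF.Filename = Path.str();
  }
  return WF;
}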
+static cl::list InputFilenames(cl::Positional, + cl::sub(MergeSubcommand), + cl::desc("")); +static cl::list + WeightedInputFilenames("weighted-input", cl::sub(MergeSubcommand), + cl::desc(",")); +static cl::opt OutputFormat( + cl::desc("Format of output profile"), cl::sub(MergeSubcommand), + cl::init(PF_Ext_Binary), + cl::values(clEnumValN(PF_Binary, "binary", "Binary encoding"), + clEnumValN(PF_Ext_Binary, "extbinary", + "Extensible binary encoding " + "(default)"), + clEnumValN(PF_Text, "text", "Text encoding"), + clEnumValN(PF_GCC, "gcc", + "GCC encoding (only meaningful for -sample)"))); +static cl::opt + InputFilenamesFile("input-files", cl::init(""), cl::sub(MergeSubcommand), + cl::desc("Path to file containing newline-separated " + "[,] entries")); +static cl::alias InputFilenamesFileA("f", cl::desc("Alias for --input-files"), + cl::aliasopt(InputFilenamesFile)); +static cl::opt DumpInputFileList( + "dump-input-file-list", cl::init(false), cl::Hidden, + cl::sub(MergeSubcommand), + cl::desc("Dump the list of input files and their weights, then exit")); +static cl::opt RemappingFile("remapping-file", + cl::value_desc("file"), + cl::sub(MergeSubcommand), + cl::desc("Symbol remapping file")); +static cl::alias RemappingFileA("r", cl::desc("Alias for --remapping-file"), + cl::aliasopt(RemappingFile)); +static cl::opt + UseMD5("use-md5", cl::init(false), cl::Hidden, + cl::desc("Choose to use MD5 to represent string in name table (only " + "meaningful for -extbinary)"), + cl::sub(MergeSubcommand)); +static cl::opt CompressAllSections( + "compress-all-sections", cl::init(false), cl::Hidden, + cl::sub(MergeSubcommand), + cl::desc("Compress all sections when writing the profile (only " + "meaningful for -extbinary)")); +static cl::opt SampleMergeColdContext( + "sample-merge-cold-context", cl::init(false), cl::Hidden, + cl::sub(MergeSubcommand), + cl::desc( + "Merge context sample profiles whose count is below cold threshold")); +static cl::opt SampleTrimColdContext( + "sample-trim-cold-context", cl::init(false), cl::Hidden, + cl::sub(MergeSubcommand), + cl::desc( + "Trim context sample profiles whose count is below cold threshold")); +static cl::opt SampleColdContextFrameDepth( + "sample-frame-depth-for-cold-context", cl::init(1), + cl::sub(MergeSubcommand), + cl::desc("Keep the last K frames while merging cold profile. 1 means the " + "context-less base profile")); +static cl::opt OutputSizeLimit( + "output-size-limit", cl::init(0), cl::Hidden, cl::sub(MergeSubcommand), + cl::desc("Trim cold functions until profile size is below specified " + "limit in bytes. This uses a heursitic and functions may be " + "excessively trimmed")); +static cl::opt GenPartialProfile( + "gen-partial-profile", cl::init(false), cl::Hidden, + cl::sub(MergeSubcommand), + cl::desc("Generate a partial profile (only meaningful for -extbinary)")); +static cl::opt SplitLayout( + "split-layout", cl::init(false), cl::Hidden, cl::sub(MergeSubcommand), + cl::desc("Split the profile to two sections with one containing sample " + "profiles with inlined functions and the other without (only " + "meaningful for -extbinary)")); +static cl::opt SupplInstrWithSample( + "supplement-instr-with-sample", cl::init(""), cl::Hidden, + cl::sub(MergeSubcommand), + cl::desc("Supplement an instr profile with sample profile, to correct " + "the profile unrepresentativeness issue. The sample " + "profile is the input of the flag. 
Output will be in instr " + "format (The flag only works with -instr)")); +static cl::opt ZeroCounterThreshold( + "zero-counter-threshold", cl::init(0.7), cl::Hidden, + cl::sub(MergeSubcommand), + cl::desc("For the function which is cold in instr profile but hot in " + "sample profile, if the ratio of the number of zero counters " + "divided by the total number of counters is above the " + "threshold, the profile of the function will be regarded as " + "being harmful for performance and will be dropped.")); +static cl::opt SupplMinSizeThreshold( + "suppl-min-size-threshold", cl::init(10), cl::Hidden, + cl::sub(MergeSubcommand), + cl::desc("If the size of a function is smaller than the threshold, " + "assume it can be inlined by PGO early inliner and it won't " + "be adjusted based on sample profile.")); +static cl::opt InstrProfColdThreshold( + "instr-prof-cold-threshold", cl::init(0), cl::Hidden, + cl::sub(MergeSubcommand), + cl::desc("User specified cold threshold for instr profile which will " + "override the cold threshold got from profile summary. ")); +// WARNING: This reservoir size value is propagated to any input indexed +// profiles for simplicity. Changing this value between invocations could +// result in sample bias. +static cl::opt TemporalProfTraceReservoirSize( + "temporal-profile-trace-reservoir-size", cl::init(100), + cl::sub(MergeSubcommand), + cl::desc("The maximum number of stored temporal profile traces (default: " + "100)")); +static cl::opt TemporalProfMaxTraceLength( + "temporal-profile-max-trace-length", cl::init(10000), + cl::sub(MergeSubcommand), + cl::desc("The maximum length of a single temporal profile trace " + "(default: 10000)")); +static cl::opt FuncNameNegativeFilter( + "no-function", cl::init(""), cl::sub(MergeSubcommand), + cl::desc("Exclude functions matching the filter from the output.")); + +static cl::opt + FailMode("failure-mode", cl::init(failIfAnyAreInvalid), + cl::desc("Failure mode:"), cl::sub(MergeSubcommand), + cl::values(clEnumValN(warnOnly, "warn", + "Do not fail and just print warnings."), + clEnumValN(failIfAnyAreInvalid, "any", + "Fail if any profile is invalid."), + clEnumValN(failIfAllAreInvalid, "all", + "Fail only if all profiles are invalid."))); + +static cl::opt OutputSparse( + "sparse", cl::init(false), cl::sub(MergeSubcommand), + cl::desc("Generate a sparse profile (only meaningful for -instr)")); +static cl::opt NumThreads( + "num-threads", cl::init(0), cl::sub(MergeSubcommand), + cl::desc("Number of merge threads to use (default: autodetect)")); +static cl::alias NumThreadsA("j", cl::desc("Alias for --num-threads"), + cl::aliasopt(NumThreads)); + +static cl::opt ProfileSymbolListFile( + "prof-sym-list", cl::init(""), cl::sub(MergeSubcommand), + cl::desc("Path to file containing the list of function symbols " + "used to populate profile symbol list")); + +static cl::opt ProfileLayout( + "convert-sample-profile-layout", + cl::desc("Convert the generated profile to a profile with a new layout"), + cl::sub(MergeSubcommand), cl::init(SPL_None), + cl::values( + clEnumValN(SPL_Nest, "nest", + "Nested profile, the input should be CS flat profile"), + clEnumValN(SPL_Flat, "flat", + "Profile with nested inlinee flatten out"))); + +static cl::opt DropProfileSymbolList( + "drop-profile-symbol-list", cl::init(false), cl::Hidden, + cl::sub(MergeSubcommand), + cl::desc("Drop the profile symbol list when merging AutoFDO profiles " + "(only meaningful for -sample)")); + +static cl::opt KeepVTableSymbols( + "keep-vtable-symbols", 
cl::init(false), cl::Hidden, + cl::sub(MergeSubcommand), + cl::desc("If true, keep the vtable symbols in indexed profiles")); + +// Temporary support for writing the previous version of the format, to enable +// some forward compatibility. +// TODO: Consider enabling this with future version changes as well, to ease +// deployment of newer versions of llvm-profdata. +static cl::opt DoWritePrevVersion( + "write-prev-version", cl::init(false), cl::Hidden, + cl::desc("Write the previous version of indexed format, to enable " + "some forward compatibility.")); + +static cl::opt MemProfVersionRequested( + "memprof-version", cl::Hidden, cl::sub(MergeSubcommand), + cl::desc("Specify the version of the memprof format to use"), + cl::init(memprof::Version3), + cl::values(clEnumValN(memprof::Version2, "2", "version 2"), + clEnumValN(memprof::Version3, "3", "version 3"), + clEnumValN(memprof::Version4, "4", "version 4"))); + +static cl::opt MemProfFullSchema( + "memprof-full-schema", cl::Hidden, cl::sub(MergeSubcommand), + cl::desc("Use the full schema for serialization"), cl::init(false)); + +static cl::opt + MemprofGenerateRandomHotness("memprof-random-hotness", cl::init(false), + cl::Hidden, cl::sub(MergeSubcommand), + cl::desc("Generate random hotness values")); +static cl::opt MemprofGenerateRandomHotnessSeed( + "memprof-random-hotness-seed", cl::init(0), cl::Hidden, + cl::sub(MergeSubcommand), + cl::desc("Random hotness seed to use (0 to generate new seed)")); + +// Options specific to overlap subcommand. +static cl::opt BaseFilename(cl::Positional, cl::Required, + cl::desc(""), + cl::sub(OverlapSubcommand)); +static cl::opt TestFilename(cl::Positional, cl::Required, + cl::desc(""), + cl::sub(OverlapSubcommand)); + +static cl::opt SimilarityCutoff( + "similarity-cutoff", cl::init(0), + cl::desc("For sample profiles, list function names (with calling context " + "for csspgo) for overlapped functions " + "with similarities below the cutoff (percentage times 10000)."), + cl::sub(OverlapSubcommand)); + +static cl::opt IsCS( + "cs", cl::init(false), + cl::desc("For context sensitive PGO counts. Does not work with CSSPGO."), + cl::sub(OverlapSubcommand)); + +static cl::opt OverlapValueCutoff( + "value-cutoff", cl::init(-1), + cl::desc( + "Function level overlap information for every function (with calling " + "context for csspgo) in test " + "profile with max count value greater than the parameter value"), + cl::sub(OverlapSubcommand)); + +// Options specific to show subcommand. +static cl::opt + ShowCounts("counts", cl::init(false), + cl::desc("Show counter values for shown functions"), + cl::sub(ShowSubcommand)); +static cl::opt + SFormat("show-format", cl::init(ShowFormat::Text), + cl::desc("Emit output in the selected format if supported"), + cl::sub(ShowSubcommand), + cl::values(clEnumValN(ShowFormat::Text, "text", + "emit normal text output (default)"), + clEnumValN(ShowFormat::Json, "json", "emit JSON"), + clEnumValN(ShowFormat::Yaml, "yaml", "emit YAML"))); +// TODO: Consider replacing this with `--show-format=text-encoding`. 
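// Illustrative sketch, not from llvm-profdata: SFormat above pairs an enum
// with clEnumValN entries so the parsed option value maps directly onto a
// ShowFormat enumerator. The same pattern with hypothetical names
// DemoShowFormat and DemoSFormat:

enum class DemoShowFormat { Text, Json, Yaml };
static llvm::cl::opt<DemoShowFormat> DemoSFormat(
    "demo-show-format", llvm::cl::init(DemoShowFormat::Text),
    llvm::cl::desc("Emit output in the selected format"),
    llvm::cl::values(
        clEnumValN(DemoShowFormat::Text, "text", "plain text (default)"),
        clEnumValN(DemoShowFormat::Json, "json", "JSON"),
        clEnumValN(DemoShowFormat::Yaml, "yaml", "YAML")));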
+static cl::opt + TextFormat("text", cl::init(false), + cl::desc("Show instr profile data in text dump format"), + cl::sub(ShowSubcommand)); +static cl::opt + JsonFormat("json", + cl::desc("Show sample profile data in the JSON format " + "(deprecated, please use --show-format=json)"), + cl::sub(ShowSubcommand)); +static cl::opt ShowIndirectCallTargets( + "ic-targets", cl::init(false), + cl::desc("Show indirect call site target values for shown functions"), + cl::sub(ShowSubcommand)); +static cl::opt + ShowVTables("show-vtables", cl::init(false), + cl::desc("Show vtable names for shown functions"), + cl::sub(ShowSubcommand)); +static cl::opt ShowMemOPSizes( + "memop-sizes", cl::init(false), + cl::desc("Show the profiled sizes of the memory intrinsic calls " + "for shown functions"), + cl::sub(ShowSubcommand)); +static cl::opt + ShowDetailedSummary("detailed-summary", cl::init(false), + cl::desc("Show detailed profile summary"), + cl::sub(ShowSubcommand)); +static cl::list DetailedSummaryCutoffs( + cl::CommaSeparated, "detailed-summary-cutoffs", + cl::desc( + "Cutoff percentages (times 10000) for generating detailed summary"), + cl::value_desc("800000,901000,999999"), cl::sub(ShowSubcommand)); +static cl::opt + ShowHotFuncList("hot-func-list", cl::init(false), + cl::desc("Show profile summary of a list of hot functions"), + cl::sub(ShowSubcommand)); +static cl::opt + ShowAllFunctions("all-functions", cl::init(false), + cl::desc("Details for each and every function"), + cl::sub(ShowSubcommand)); +static cl::opt ShowCS("showcs", cl::init(false), + cl::desc("Show context sensitive counts"), + cl::sub(ShowSubcommand)); +static cl::opt ShowProfileKind( + cl::desc("Profile kind supported by show:"), cl::sub(ShowSubcommand), + cl::init(instr), + cl::values(clEnumVal(instr, "Instrumentation profile (default)"), + clEnumVal(sample, "Sample profile"), + clEnumVal(memory, "MemProf memory access profile"))); +static cl::opt TopNFunctions( + "topn", cl::init(0), + cl::desc("Show the list of functions with the largest internal counts"), + cl::sub(ShowSubcommand)); +static cl::opt ShowValueCutoff( + "value-cutoff", cl::init(0), + cl::desc("Set the count value cutoff. Functions with the maximum count " + "less than this value will not be printed out. (Default is 0)"), + cl::sub(ShowSubcommand)); +static cl::opt OnlyListBelow( + "list-below-cutoff", cl::init(false), + cl::desc("Only output names of functions whose max count values are " + "below the cutoff value"), + cl::sub(ShowSubcommand)); +static cl::opt ShowProfileSymbolList( + "show-prof-sym-list", cl::init(false), + cl::desc("Show profile symbol list if it exists in the profile. "), + cl::sub(ShowSubcommand)); +static cl::opt ShowSectionInfoOnly( + "show-sec-info-only", cl::init(false), + cl::desc("Show the information of each section in the sample profile. " + "The flag is only usable when the sample profile is in " + "extbinary format"), + cl::sub(ShowSubcommand)); +static cl::opt ShowBinaryIds("binary-ids", cl::init(false), + cl::desc("Show binary ids in the profile. "), + cl::sub(ShowSubcommand)); +static cl::opt ShowTemporalProfTraces( + "temporal-profile-traces", + cl::desc("Show temporal profile traces in the profile."), + cl::sub(ShowSubcommand)); + +static cl::opt + ShowCovered("covered", cl::init(false), + cl::desc("Show only the functions that have been executed."), + cl::sub(ShowSubcommand)); + +static cl::opt ShowProfileVersion("profile-version", cl::init(false), + cl::desc("Show profile version. 
"), + cl::sub(ShowSubcommand)); + +// Options specific to order subcommand. +static cl::opt + NumTestTraces("num-test-traces", cl::init(0), + cl::desc("Keep aside the last traces in " + "the profile when computing the function order and " + "instead use them to evaluate that order"), + cl::sub(OrderSubcommand)); + +// We use this string to indicate that there are +// multiple static functions map to the same name. +const std::string DuplicateNameStr = "----"; + +static void warn(Twine Message, StringRef Whence = "", StringRef Hint = "") { + WithColor::warning(); + if (!Whence.empty()) + errs() << Whence << ": "; + errs() << Message << "\n"; + if (!Hint.empty()) + WithColor::note() << Hint << "\n"; +} + +static void warn(Error E, StringRef Whence = "") { + if (E.isA()) { + handleAllErrors(std::move(E), [&](const InstrProfError &IPE) { + warn(IPE.message(), Whence); + }); + } +} + +static void exitWithError(Twine Message, StringRef Whence = "", + StringRef Hint = "") { + WithColor::error(); + if (!Whence.empty()) + errs() << Whence << ": "; + errs() << Message << "\n"; + if (!Hint.empty()) + WithColor::note() << Hint << "\n"; + ::exit(1); +} + +static void exitWithError(Error E, StringRef Whence = "") { + if (E.isA()) { + handleAllErrors(std::move(E), [&](const InstrProfError &IPE) { + instrprof_error instrError = IPE.get(); + StringRef Hint = ""; + if (instrError == instrprof_error::unrecognized_format) { + // Hint in case user missed specifying the profile type. + Hint = "Perhaps you forgot to use the --sample or --memory option?"; + } + exitWithError(IPE.message(), Whence, Hint); + }); + return; + } + + exitWithError(toString(std::move(E)), Whence); +} + +static void exitWithErrorCode(std::error_code EC, StringRef Whence = "") { + exitWithError(EC.message(), Whence); +} + +static void warnOrExitGivenError(FailureMode FailMode, std::error_code EC, + StringRef Whence = "") { + if (FailMode == failIfAnyAreInvalid) + exitWithErrorCode(EC, Whence); + else + warn(EC.message(), Whence); +} + +static void handleMergeWriterError(Error E, StringRef WhenceFile = "", + StringRef WhenceFunction = "", + bool ShowHint = true) { + if (!WhenceFile.empty()) + errs() << WhenceFile << ": "; + if (!WhenceFunction.empty()) + errs() << WhenceFunction << ": "; + + auto IPE = instrprof_error::success; + E = handleErrors(std::move(E), + [&IPE](std::unique_ptr E) -> Error { + IPE = E->get(); + return Error(std::move(E)); + }); + errs() << toString(std::move(E)) << "\n"; + + if (ShowHint) { + StringRef Hint = ""; + if (IPE != instrprof_error::success) { + switch (IPE) { + case instrprof_error::hash_mismatch: + case instrprof_error::count_mismatch: + case instrprof_error::value_site_count_mismatch: + Hint = "Make sure that all profile data to be merged is generated " + "from the same binary."; + break; + default: + break; + } + } + + if (!Hint.empty()) + errs() << Hint << "\n"; + } +} + +namespace { +/// A remapper from original symbol names to new symbol names based on a file +/// containing a list of mappings from old name to new name. +class SymbolRemapper { + std::unique_ptr File; + DenseMap RemappingTable; + +public: + /// Build a SymbolRemapper from a file containing a list of old/new symbols. 
+ static std::unique_ptr create(StringRef InputFile) { + auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFile); + if (!BufOrError) + exitWithErrorCode(BufOrError.getError(), InputFile); + + auto Remapper = std::make_unique(); + Remapper->File = std::move(BufOrError.get()); + + for (line_iterator LineIt(*Remapper->File, /*SkipBlanks=*/true, '#'); + !LineIt.is_at_eof(); ++LineIt) { + std::pair Parts = LineIt->split(' '); + if (Parts.first.empty() || Parts.second.empty() || + Parts.second.count(' ')) { + exitWithError("unexpected line in remapping file", + (InputFile + ":" + Twine(LineIt.line_number())).str(), + "expected 'old_symbol new_symbol'"); + } + Remapper->RemappingTable.insert(Parts); + } + return Remapper; + } + + /// Attempt to map the given old symbol into a new symbol. + /// + /// \return The new symbol, or \p Name if no such symbol was found. + StringRef operator()(StringRef Name) { + StringRef New = RemappingTable.lookup(Name); + return New.empty() ? Name : New; + } + + FunctionId operator()(FunctionId Name) { + // MD5 name cannot be remapped. + if (!Name.isStringRef()) + return Name; + StringRef New = RemappingTable.lookup(Name.stringRef()); + return New.empty() ? Name : FunctionId(New); + } +}; +} + +struct WeightedFile { + std::string Filename; + uint64_t Weight; +}; +typedef SmallVector WeightedFileVector; + +/// Keep track of merged data and reported errors. +struct WriterContext { + std::mutex Lock; + InstrProfWriter Writer; + std::vector> Errors; + std::mutex &ErrLock; + SmallSet &WriterErrorCodes; + + WriterContext(bool IsSparse, std::mutex &ErrLock, + SmallSet &WriterErrorCodes, + uint64_t ReservoirSize = 0, uint64_t MaxTraceLength = 0) + : Writer(IsSparse, ReservoirSize, MaxTraceLength, DoWritePrevVersion, + MemProfVersionRequested, MemProfFullSchema, + MemprofGenerateRandomHotness, MemprofGenerateRandomHotnessSeed), + ErrLock(ErrLock), WriterErrorCodes(WriterErrorCodes) {} +}; + +/// Computer the overlap b/w profile BaseFilename and TestFileName, +/// and store the program level result to Overlap. +static void overlapInput(const std::string &BaseFilename, + const std::string &TestFilename, WriterContext *WC, + OverlapStats &Overlap, + const OverlapFuncFilters &FuncFilter, + raw_fd_ostream &OS, bool IsCS) { + auto FS = vfs::getRealFileSystem(); + auto ReaderOrErr = InstrProfReader::create(TestFilename, *FS); + if (Error E = ReaderOrErr.takeError()) { + // Skip the empty profiles by returning sliently. + auto [ErrorCode, Msg] = InstrProfError::take(std::move(E)); + if (ErrorCode != instrprof_error::empty_raw_profile) + WC->Errors.emplace_back(make_error(ErrorCode, Msg), + TestFilename); + return; + } + + auto Reader = std::move(ReaderOrErr.get()); + for (auto &I : *Reader) { + OverlapStats FuncOverlap(OverlapStats::FunctionLevel); + FuncOverlap.setFuncInfo(I.Name, I.Hash); + + WC->Writer.overlapRecord(std::move(I), Overlap, FuncOverlap, FuncFilter); + FuncOverlap.dump(OS); + } +} + +/// Load an input into a writer context. +static void +loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, + const InstrProfCorrelator *Correlator, const StringRef ProfiledBinary, + WriterContext *WC, const object::BuildIDFetcher *BIDFetcher = nullptr, + const ProfCorrelatorKind *BIDFetcherCorrelatorKind = nullptr) { + std::unique_lock CtxGuard{WC->Lock}; + + // Copy the filename, because llvm::ThreadPool copied the input "const + // WeightedFile &" by value, making a reference to the filename within it + // invalid outside of this packaged task. 
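// Illustrative sketch, not from llvm-profdata: the lifetime hazard the
// comment above describes, in miniature. Data must be copied into the task,
// because a reference argument may be dead by the time the pool runs the
// closure. demoSchedule is a hypothetical name.

#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/raw_ostream.h"
#include <string>

static void demoSchedule(llvm::DefaultThreadPool &Pool,
                         const std::string &Name) {
  // Capture by value: the caller's `Name` can be destroyed before the
  // closure executes on a worker thread.
  Pool.async([NameCopy = Name] { llvm::outs() << NameCopy << "\n"; });
}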
+ std::string Filename = Input.Filename; + + using ::llvm::memprof::RawMemProfReader; + if (RawMemProfReader::hasFormat(Input.Filename)) { + auto ReaderOrErr = RawMemProfReader::create(Input.Filename, ProfiledBinary); + if (!ReaderOrErr) { + exitWithError(ReaderOrErr.takeError(), Input.Filename); + } + std::unique_ptr Reader = std::move(ReaderOrErr.get()); + // Check if the profile types can be merged, e.g. clang frontend profiles + // should not be merged with memprof profiles. + if (Error E = WC->Writer.mergeProfileKind(Reader->getProfileKind())) { + consumeError(std::move(E)); + WC->Errors.emplace_back( + make_error( + "Cannot merge MemProf profile with Clang generated profile.", + std::error_code()), + Filename); + return; + } + + auto MemProfError = [&](Error E) { + auto [ErrorCode, Msg] = InstrProfError::take(std::move(E)); + WC->Errors.emplace_back(make_error(ErrorCode, Msg), + Filename); + }; + + WC->Writer.addMemProfData(Reader->takeMemProfData(), MemProfError); + return; + } + + using ::llvm::memprof::YAMLMemProfReader; + if (YAMLMemProfReader::hasFormat(Input.Filename)) { + auto ReaderOrErr = YAMLMemProfReader::create(Input.Filename); + if (!ReaderOrErr) + exitWithError(ReaderOrErr.takeError(), Input.Filename); + std::unique_ptr Reader = std::move(ReaderOrErr.get()); + // Check if the profile types can be merged, e.g. clang frontend profiles + // should not be merged with memprof profiles. + if (Error E = WC->Writer.mergeProfileKind(Reader->getProfileKind())) { + consumeError(std::move(E)); + WC->Errors.emplace_back( + make_error( + "Cannot merge MemProf profile with incompatible profile.", + std::error_code()), + Filename); + return; + } + + auto MemProfError = [&](Error E) { + auto [ErrorCode, Msg] = InstrProfError::take(std::move(E)); + WC->Errors.emplace_back(make_error(ErrorCode, Msg), + Filename); + }; + + auto MemProfData = Reader->takeMemProfData(); + + auto DataAccessProfData = Reader->takeDataAccessProfData(); + + // Check for the empty input in case the YAML file is invalid. + if (MemProfData.Records.empty()) { + WC->Errors.emplace_back( + make_error("The profile is empty.", std::error_code()), + Filename); + } + + WC->Writer.addMemProfData(std::move(MemProfData), MemProfError); + WC->Writer.addDataAccessProfData(std::move(DataAccessProfData)); + return; + } + + auto FS = vfs::getRealFileSystem(); + // TODO: This only saves the first non-fatal error from InstrProfReader, and + // then added to WriterContext::Errors. However, this is not extensible, if + // we have more non-fatal errors from InstrProfReader in the future. How + // should this interact with different -failure-mode? + std::optional> ReaderWarning; + auto Warn = [&](Error E) { + if (ReaderWarning) { + consumeError(std::move(E)); + return; + } + // Only show the first time an error occurs in this file. + auto [ErrCode, Msg] = InstrProfError::take(std::move(E)); + ReaderWarning = {make_error(ErrCode, Msg), Filename}; + }; + + const ProfCorrelatorKind CorrelatorKind = BIDFetcherCorrelatorKind + ? *BIDFetcherCorrelatorKind + : ProfCorrelatorKind::NONE; + auto ReaderOrErr = InstrProfReader::create(Input.Filename, *FS, Correlator, + BIDFetcher, CorrelatorKind, Warn); + if (Error E = ReaderOrErr.takeError()) { + // Skip the empty profiles by returning silently. 
+ auto [ErrCode, Msg] = InstrProfError::take(std::move(E)); + if (ErrCode != instrprof_error::empty_raw_profile) + WC->Errors.emplace_back(make_error(ErrCode, Msg), + Filename); + return; + } + + auto Reader = std::move(ReaderOrErr.get()); + if (Error E = WC->Writer.mergeProfileKind(Reader->getProfileKind())) { + consumeError(std::move(E)); + WC->Errors.emplace_back( + make_error( + "Merge IR generated profile with Clang generated profile.", + std::error_code()), + Filename); + return; + } + + for (auto &I : *Reader) { + if (Remapper) + I.Name = (*Remapper)(I.Name); + const StringRef FuncName = I.Name; + bool Reported = false; + WC->Writer.addRecord(std::move(I), Input.Weight, [&](Error E) { + if (Reported) { + consumeError(std::move(E)); + return; + } + Reported = true; + // Only show hint the first time an error occurs. + auto [ErrCode, Msg] = InstrProfError::take(std::move(E)); + std::unique_lock ErrGuard{WC->ErrLock}; + bool firstTime = WC->WriterErrorCodes.insert(ErrCode).second; + handleMergeWriterError(make_error(ErrCode, Msg), + Input.Filename, FuncName, firstTime); + }); + } + + if (KeepVTableSymbols) { + const InstrProfSymtab &symtab = Reader->getSymtab(); + const auto &VTableNames = symtab.getVTableNames(); + + for (const auto &kv : VTableNames) + WC->Writer.addVTableName(kv.getKey()); + } + + if (Reader->hasTemporalProfile()) { + auto &Traces = Reader->getTemporalProfTraces(Input.Weight); + if (!Traces.empty()) + WC->Writer.addTemporalProfileTraces( + Traces, Reader->getTemporalProfTraceStreamSize()); + } + if (Reader->hasError()) { + if (Error E = Reader->getError()) { + WC->Errors.emplace_back(std::move(E), Filename); + return; + } + } + + std::vector BinaryIds; + if (Error E = Reader->readBinaryIds(BinaryIds)) { + WC->Errors.emplace_back(std::move(E), Filename); + return; + } + WC->Writer.addBinaryIds(BinaryIds); + + if (ReaderWarning) { + WC->Errors.emplace_back(std::move(ReaderWarning->first), + ReaderWarning->second); + } +} + +/// Merge the \p Src writer context into \p Dst. +static void mergeWriterContexts(WriterContext *Dst, WriterContext *Src) { + for (auto &ErrorPair : Src->Errors) + Dst->Errors.push_back(std::move(ErrorPair)); + Src->Errors.clear(); + + if (Error E = Dst->Writer.mergeProfileKind(Src->Writer.getProfileKind())) + exitWithError(std::move(E)); + + Dst->Writer.mergeRecordsFromWriter(std::move(Src->Writer), [&](Error E) { + auto [ErrorCode, Msg] = InstrProfError::take(std::move(E)); + std::unique_lock ErrGuard{Dst->ErrLock}; + bool firstTime = Dst->WriterErrorCodes.insert(ErrorCode).second; + if (firstTime) + warn(toString(make_error(ErrorCode, Msg))); + }); +} + +static StringRef +getFuncName(const StringMap::value_type &Val) { + return Val.first(); +} + +static std::string +getFuncName(const SampleProfileMap::value_type &Val) { + return Val.second.getContext().toString(); +} + +template +static void filterFunctions(T &ProfileMap) { + bool hasFilter = !FuncNameFilter.empty(); + bool hasNegativeFilter = !FuncNameNegativeFilter.empty(); + if (!hasFilter && !hasNegativeFilter) + return; + + // If filter starts with '?' it is MSVC mangled name, not a regex. + llvm::Regex ProbablyMSVCMangledName("[?@$_0-9A-Za-z]+"); + if (hasFilter && FuncNameFilter[0] == '?' && + ProbablyMSVCMangledName.match(FuncNameFilter)) + FuncNameFilter = llvm::Regex::escape(FuncNameFilter); + if (hasNegativeFilter && FuncNameNegativeFilter[0] == '?' 
&& + ProbablyMSVCMangledName.match(FuncNameNegativeFilter)) + FuncNameNegativeFilter = llvm::Regex::escape(FuncNameNegativeFilter); + + size_t Count = ProfileMap.size(); + llvm::Regex Pattern(FuncNameFilter); + llvm::Regex NegativePattern(FuncNameNegativeFilter); + std::string Error; + if (hasFilter && !Pattern.isValid(Error)) + exitWithError(Error); + if (hasNegativeFilter && !NegativePattern.isValid(Error)) + exitWithError(Error); + + // Handle MD5 profile, so it is still able to match using the original name. + std::string MD5Name = std::to_string(llvm::MD5Hash(FuncNameFilter)); + std::string NegativeMD5Name = + std::to_string(llvm::MD5Hash(FuncNameNegativeFilter)); + + for (auto I = ProfileMap.begin(); I != ProfileMap.end();) { + auto Tmp = I++; + const auto &FuncName = getFuncName(*Tmp); + // Negative filter has higher precedence than positive filter. + if ((hasNegativeFilter && + (NegativePattern.match(FuncName) || + (FunctionSamples::UseMD5 && NegativeMD5Name == FuncName))) || + (hasFilter && !(Pattern.match(FuncName) || + (FunctionSamples::UseMD5 && MD5Name == FuncName)))) + ProfileMap.erase(Tmp); + } + + llvm::dbgs() << Count - ProfileMap.size() << " of " << Count << " functions " + << "in the original profile are filtered.\n"; +} + +static void writeInstrProfile(StringRef OutputFilename, + ProfileFormat OutputFormat, + InstrProfWriter &Writer) { + std::error_code EC; + raw_fd_ostream Output(OutputFilename.data(), EC, + OutputFormat == PF_Text ? sys::fs::OF_TextWithCRLF + : sys::fs::OF_None); + if (EC) + exitWithErrorCode(EC, OutputFilename); + + if (OutputFormat == PF_Text) { + if (Error E = Writer.writeText(Output)) + warn(std::move(E)); + } else { + if (Output.is_displayed()) + exitWithError("cannot write a non-text format profile to the terminal"); + if (Error E = Writer.write(Output)) + warn(std::move(E)); + } +} + +static void mergeInstrProfile(const WeightedFileVector &Inputs, + SymbolRemapper *Remapper, + int MaxDbgCorrelationWarnings, + const StringRef ProfiledBinary) { + const uint64_t TraceReservoirSize = TemporalProfTraceReservoirSize.getValue(); + const uint64_t MaxTraceLength = TemporalProfMaxTraceLength.getValue(); + if (OutputFormat == PF_Compact_Binary) + exitWithError("Compact Binary is deprecated"); + if (OutputFormat != PF_Binary && OutputFormat != PF_Ext_Binary && + OutputFormat != PF_Text) + exitWithError("unknown format is specified"); + + // TODO: Maybe we should support correlation with mixture of different + // correlation modes(w/wo debug-info/object correlation). 
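// Illustrative sketch, not from llvm-profdata: a condensed model of the
// mutual-exclusion rule enforced below, where debug-info, binary, and
// build-ID fetcher correlation inputs must not be combined.
// demoCheckCorrelationInputs is a hypothetical name and folds
// -debuginfod/-debug-file-directory into a single flag.

#include "llvm/Support/ErrorHandling.h"

static void demoCheckCorrelationInputs(bool HasDebugInfo, bool HasBinary,
                                       bool HasBuildIDFetcher) {
  if ((HasDebugInfo && (HasBinary || HasBuildIDFetcher)) ||
      (HasBinary && HasBuildIDFetcher))
    llvm::report_fatal_error("expected only one of -debug-info, -binary-file, "
                             "-debuginfod or -debug-file-directory");
}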
+ if (DebugInfoFilename.empty()) { + if (!BinaryFilename.empty() && (DebugInfod || !DebugFileDirectory.empty())) + exitWithError("Expected only one of -binary-file, -debuginfod or " + "-debug-file-directory"); + } else if (!BinaryFilename.empty() || DebugInfod || + !DebugFileDirectory.empty()) { + exitWithError("Expected only one of -debug-info, -binary-file, -debuginfod " + "or -debug-file-directory"); + } + std::string CorrelateFilename; + ProfCorrelatorKind CorrelateKind = ProfCorrelatorKind::NONE; + if (!DebugInfoFilename.empty()) { + CorrelateFilename = DebugInfoFilename; + CorrelateKind = ProfCorrelatorKind::DEBUG_INFO; + } else if (!BinaryFilename.empty()) { + CorrelateFilename = BinaryFilename; + CorrelateKind = ProfCorrelatorKind::BINARY; + } + + std::unique_ptr Correlator; + if (CorrelateKind != InstrProfCorrelator::NONE) { + if (auto Err = InstrProfCorrelator::get(CorrelateFilename, CorrelateKind) + .moveInto(Correlator)) + exitWithError(std::move(Err), CorrelateFilename); + if (auto Err = Correlator->correlateProfileData(MaxDbgCorrelationWarnings)) + exitWithError(std::move(Err), CorrelateFilename); + } + + ProfCorrelatorKind BIDFetcherCorrelateKind = ProfCorrelatorKind::NONE; + std::unique_ptr BIDFetcher; + if (DebugInfod) { + llvm::HTTPClient::initialize(); + BIDFetcher = std::make_unique(DebugFileDirectory); + if (!BIDFetcherProfileCorrelate) + exitWithError("Expected --correlate when --debuginfod is provided"); + BIDFetcherCorrelateKind = BIDFetcherProfileCorrelate; + } else if (!DebugFileDirectory.empty()) { + BIDFetcher = std::make_unique(DebugFileDirectory); + if (!BIDFetcherProfileCorrelate) + exitWithError("Expected --correlate when --debug-file-directory " + "is provided"); + BIDFetcherCorrelateKind = BIDFetcherProfileCorrelate; + } else if (BIDFetcherProfileCorrelate) { + exitWithError("Expected --debuginfod or --debug-file-directory when " + "--correlate is provided"); + } + + std::mutex ErrorLock; + SmallSet WriterErrorCodes; + + // If NumThreads is not specified, auto-detect a good default. + if (NumThreads == 0) + NumThreads = std::min(hardware_concurrency().compute_thread_count(), + unsigned((Inputs.size() + 1) / 2)); + + // Initialize the writer contexts. + SmallVector, 4> Contexts; + for (unsigned I = 0; I < NumThreads; ++I) + Contexts.emplace_back(std::make_unique( + OutputSparse, ErrorLock, WriterErrorCodes, TraceReservoirSize, + MaxTraceLength)); + + if (NumThreads == 1) { + for (const auto &Input : Inputs) + loadInput(Input, Remapper, Correlator.get(), ProfiledBinary, + Contexts[0].get(), BIDFetcher.get(), &BIDFetcherCorrelateKind); + } else { + DefaultThreadPool Pool(hardware_concurrency(NumThreads)); + + // Load the inputs in parallel (N/NumThreads serial steps). + unsigned Ctx = 0; + for (const auto &Input : Inputs) { + Pool.async(loadInput, Input, Remapper, Correlator.get(), ProfiledBinary, + Contexts[Ctx].get(), BIDFetcher.get(), + &BIDFetcherCorrelateKind); + Ctx = (Ctx + 1) % NumThreads; + } + Pool.wait(); + + // Merge the writer contexts together (~ lg(NumThreads) serial steps). 
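// Illustrative sketch, not from llvm-profdata: the loop below halves the
// live range each round, merging context I+Mid into context I and folding an
// unpaired odd element into slot 0, so all contexts end up merged into
// Contexts[0] after ~lg(N) rounds. The same shape on plain integers (assumes
// a non-empty vector); demoReducePairwise is a hypothetical name.

#include <vector>

static int demoReducePairwise(std::vector<int> Vals) {
  unsigned Mid = Vals.size() / 2, End = Vals.size();
  while (Mid > 0) {
    for (unsigned I = 0; I < Mid; ++I)
      Vals[I] += Vals[I + Mid]; // merge the upper half into the lower half
    if (End & 1)
      Vals[0] += Vals[End - 1]; // fold in the unpaired odd element
    End = Mid;
    Mid /= 2;
  }
  return Vals[0]; // everything has been merged into slot 0
}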
+ unsigned Mid = Contexts.size() / 2; + unsigned End = Contexts.size(); + assert(Mid > 0 && "Expected more than one context"); + do { + for (unsigned I = 0; I < Mid; ++I) + Pool.async(mergeWriterContexts, Contexts[I].get(), + Contexts[I + Mid].get()); + Pool.wait(); + if (End & 1) { + Pool.async(mergeWriterContexts, Contexts[0].get(), + Contexts[End - 1].get()); + Pool.wait(); + } + End = Mid; + Mid /= 2; + } while (Mid > 0); + } + + // Handle deferred errors encountered during merging. If the number of errors + // is equal to the number of inputs the merge failed. + unsigned NumErrors = 0; + for (std::unique_ptr &WC : Contexts) { + for (auto &ErrorPair : WC->Errors) { + ++NumErrors; + warn(toString(std::move(ErrorPair.first)), ErrorPair.second); + } + } + if ((NumErrors == Inputs.size() && FailMode == failIfAllAreInvalid) || + (NumErrors > 0 && FailMode == failIfAnyAreInvalid)) + exitWithError("no profile can be merged"); + + filterFunctions(Contexts[0]->Writer.getProfileData()); + + writeInstrProfile(OutputFilename, OutputFormat, Contexts[0]->Writer); +} + +/// The profile entry for a function in instrumentation profile. +struct InstrProfileEntry { + uint64_t MaxCount = 0; + uint64_t NumEdgeCounters = 0; + float ZeroCounterRatio = 0.0; + InstrProfRecord *ProfRecord; + InstrProfileEntry(InstrProfRecord *Record); + InstrProfileEntry() = default; +}; + +InstrProfileEntry::InstrProfileEntry(InstrProfRecord *Record) { + ProfRecord = Record; + uint64_t CntNum = Record->Counts.size(); + uint64_t ZeroCntNum = 0; + for (size_t I = 0; I < CntNum; ++I) { + MaxCount = std::max(MaxCount, Record->Counts[I]); + ZeroCntNum += !Record->Counts[I]; + } + ZeroCounterRatio = (float)ZeroCntNum / CntNum; + NumEdgeCounters = CntNum; +} + +/// Either set all the counters in the instr profile entry \p IFE to +/// -1 / -2 /in order to drop the profile or scale up the +/// counters in \p IFP to be above hot / cold threshold. We use +/// the ratio of zero counters in the profile of a function to +/// decide the profile is helpful or harmful for performance, +/// and to choose whether to scale up or drop it. +static void updateInstrProfileEntry(InstrProfileEntry &IFE, bool SetToHot, + uint64_t HotInstrThreshold, + uint64_t ColdInstrThreshold, + float ZeroCounterThreshold) { + InstrProfRecord *ProfRecord = IFE.ProfRecord; + if (!IFE.MaxCount || IFE.ZeroCounterRatio > ZeroCounterThreshold) { + // If all or most of the counters of the function are zero, the + // profile is unaccountable and should be dropped. Reset all the + // counters to be -1 / -2 and PGO profile-use will drop the profile. + // All counters being -1 also implies that the function is hot so + // PGO profile-use will also set the entry count metadata to be + // above hot threshold. + // All counters being -2 implies that the function is warm so + // PGO profile-use will also set the entry count metadata to be + // above cold threshold. + auto Kind = + (SetToHot ? InstrProfRecord::PseudoHot : InstrProfRecord::PseudoWarm); + ProfRecord->setPseudoCount(Kind); + return; + } + + // Scale up the MaxCount to be multiple times above hot / cold threshold. + const unsigned MultiplyFactor = 3; + uint64_t Threshold = (SetToHot ? HotInstrThreshold : ColdInstrThreshold); + uint64_t Numerator = Threshold * MultiplyFactor; + + // Make sure Threshold for warm counters is below the HotInstrThreshold. 
+ if (!SetToHot && Threshold >= HotInstrThreshold) { + Threshold = (HotInstrThreshold + ColdInstrThreshold) / 2; + } + + uint64_t Denominator = IFE.MaxCount; + if (Numerator <= Denominator) + return; + ProfRecord->scale(Numerator, Denominator, [&](instrprof_error E) { + warn(toString(make_error(E))); + }); +} + +const uint64_t ColdPercentileIdx = 15; +const uint64_t HotPercentileIdx = 11; + +using sampleprof::FSDiscriminatorPass; + +// Internal options to set FSDiscriminatorPass. Used in merge and show +// commands. +static cl::opt FSDiscriminatorPassOption( + "fs-discriminator-pass", cl::init(PassLast), cl::Hidden, + cl::desc("Zero out the discriminator bits for the FS discrimiantor " + "pass beyond this value. The enum values are defined in " + "Support/Discriminator.h"), + cl::values(clEnumVal(Base, "Use base discriminators only"), + clEnumVal(Pass1, "Use base and pass 1 discriminators"), + clEnumVal(Pass2, "Use base and pass 1-2 discriminators"), + clEnumVal(Pass3, "Use base and pass 1-3 discriminators"), + clEnumVal(PassLast, "Use all discriminator bits (default)"))); + +static unsigned getDiscriminatorMask() { + return getN1Bits(getFSPassBitEnd(FSDiscriminatorPassOption.getValue())); +} + +/// Adjust the instr profile in \p WC based on the sample profile in +/// \p Reader. +static void +adjustInstrProfile(std::unique_ptr &WC, + std::unique_ptr &Reader, + unsigned SupplMinSizeThreshold, float ZeroCounterThreshold, + unsigned InstrProfColdThreshold) { + // Function to its entry in instr profile. + StringMap InstrProfileMap; + StringMap StaticFuncMap; + InstrProfSummaryBuilder IPBuilder(ProfileSummaryBuilder::DefaultCutoffs); + + auto checkSampleProfileHasFUnique = [&Reader]() { + for (const auto &PD : Reader->getProfiles()) { + auto &FContext = PD.second.getContext(); + if (FContext.toString().find(FunctionSamples::UniqSuffix) != + std::string::npos) { + return true; + } + } + return false; + }; + + bool SampleProfileHasFUnique = checkSampleProfileHasFUnique(); + + auto buildStaticFuncMap = [&StaticFuncMap, + SampleProfileHasFUnique](const StringRef Name) { + std::string FilePrefixes[] = {".cpp", "cc", ".c", ".hpp", ".h"}; + size_t PrefixPos = StringRef::npos; + for (auto &FilePrefix : FilePrefixes) { + std::string NamePrefix = FilePrefix + GlobalIdentifierDelimiter; + PrefixPos = Name.find_insensitive(NamePrefix); + if (PrefixPos == StringRef::npos) + continue; + PrefixPos += NamePrefix.size(); + break; + } + + if (PrefixPos == StringRef::npos) { + return; + } + + StringRef NewName = Name.drop_front(PrefixPos); + StringRef FName = Name.substr(0, PrefixPos - 1); + if (NewName.size() == 0) { + return; + } + + // This name should have a static linkage. + size_t PostfixPos = NewName.find(FunctionSamples::UniqSuffix); + bool ProfileHasFUnique = (PostfixPos != StringRef::npos); + + // If sample profile and instrumented profile do not agree on symbol + // uniqification. + if (SampleProfileHasFUnique != ProfileHasFUnique) { + // If instrumented profile uses -funique-internal-linkage-symbols, + // we need to trim the name. + if (ProfileHasFUnique) { + NewName = NewName.substr(0, PostfixPos); + } else { + // If sample profile uses -funique-internal-linkage-symbols, + // we build the map. 
+ std::string NStr = + NewName.str() + getUniqueInternalLinkagePostfix(FName); + NewName = StringRef(NStr); + StaticFuncMap[NewName] = Name; + return; + } + } + + auto [It, Inserted] = StaticFuncMap.try_emplace(NewName, Name); + if (!Inserted) + It->second = DuplicateNameStr; + }; + + // We need to flatten the SampleFDO profile as the InstrFDO + // profile does not have inlined callsite profiles. + // One caveat is the pre-inlined function -- their samples + // should be collapsed into the caller function. + // Here we do a DFS traversal to get the flatten profile + // info: the sum of entrycount and the max of maxcount. + // Here is the algorithm: + // recursive (FS, root_name) { + // name = FS->getName(); + // get samples for FS; + // if (InstrProf.find(name) { + // root_name = name; + // } else { + // if (name is in static_func map) { + // root_name = static_name; + // } + // } + // update the Map entry for root_name; + // for (subfs: FS) { + // recursive(subfs, root_name); + // } + // } + // + // Here is an example. + // + // SampleProfile: + // foo:12345:1000 + // 1: 1000 + // 2.1: 1000 + // 15: 5000 + // 4: bar:1000 + // 1: 1000 + // 2: goo:3000 + // 1: 3000 + // 8: bar:40000 + // 1: 10000 + // 2: goo:30000 + // 1: 30000 + // + // InstrProfile has two entries: + // foo + // bar.cc;bar + // + // After BuildMaxSampleMap, we should have the following in FlattenSampleMap: + // {"foo", {1000, 5000}} + // {"bar.cc;bar", {11000, 30000}} + // + // foo's has an entry count of 1000, and max body count of 5000. + // bar.cc;bar has an entry count of 11000 (sum two callsites of 1000 and + // 10000), and max count of 30000 (from the callsite in line 8). + // + // Note that goo's count will remain in bar.cc;bar() as it does not have an + // entry in InstrProfile. + llvm::StringMap> FlattenSampleMap; + auto BuildMaxSampleMap = [&FlattenSampleMap, &StaticFuncMap, + &InstrProfileMap](const FunctionSamples &FS, + const StringRef &RootName) { + auto BuildMaxSampleMapImpl = [&](const FunctionSamples &FS, + const StringRef &RootName, + auto &BuildImpl) -> void { + std::string NameStr = FS.getFunction().str(); + const StringRef Name = NameStr; + const StringRef *NewRootName = &RootName; + uint64_t EntrySample = FS.getHeadSamplesEstimate(); + uint64_t MaxBodySample = FS.getMaxCountInside(/* SkipCallSite*/ true); + + auto It = InstrProfileMap.find(Name); + if (It != InstrProfileMap.end()) { + NewRootName = &Name; + } else { + auto NewName = StaticFuncMap.find(Name); + if (NewName != StaticFuncMap.end()) { + It = InstrProfileMap.find(NewName->second); + if (NewName->second != DuplicateNameStr) { + NewRootName = &NewName->second; + } + } else { + // Here the EntrySample is of an inlined function, so we should not + // update the EntrySample in the map. + EntrySample = 0; + } + } + EntrySample += FlattenSampleMap[*NewRootName].first; + MaxBodySample = + std::max(FlattenSampleMap[*NewRootName].second, MaxBodySample); + FlattenSampleMap[*NewRootName] = + std::make_pair(EntrySample, MaxBodySample); + + for (const auto &C : FS.getCallsiteSamples()) + for (const auto &F : C.second) + BuildImpl(F.second, *NewRootName, BuildImpl); + }; + BuildMaxSampleMapImpl(FS, RootName, BuildMaxSampleMapImpl); + }; + + for (auto &PD : WC->Writer.getProfileData()) { + // Populate IPBuilder. + for (const auto &PDV : PD.getValue()) { + InstrProfRecord Record = PDV.second; + IPBuilder.addRecord(Record); + } + + // If a function has multiple entries in instr profile, skip it. 
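// Illustrative sketch, not from llvm-profdata: the flattening rule
// implemented by BuildMaxSampleMap above, in miniature. Entry samples
// attributed to the same root function accumulate, while the maximum body
// sample takes the max. DemoFlatMap and demoAccumulate are hypothetical
// names.

#include <algorithm>
#include <cstdint>
#include <map>
#include <string>
#include <utility>

using DemoFlatMap = std::map<std::string, std::pair<uint64_t, uint64_t>>;

static void demoAccumulate(DemoFlatMap &M, const std::string &Root,
                           uint64_t EntrySamples, uint64_t MaxBodySamples) {
  auto &[Entry, MaxBody] = M[Root];
  Entry += EntrySamples;
  MaxBody = std::max(MaxBody, MaxBodySamples);
}

// Replaying the worked example from the comment above: accumulating entry
// samples 1000 and 10000 for "bar.cc;bar", with a maximum body sample of
// 30000, yields the documented {11000, 30000} pair.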
+ if (PD.getValue().size() != 1) + continue; + + // Initialize InstrProfileMap. + InstrProfRecord *R = &PD.getValue().begin()->second; + StringRef FullName = PD.getKey(); + InstrProfileMap[FullName] = InstrProfileEntry(R); + buildStaticFuncMap(FullName); + } + + for (auto &PD : Reader->getProfiles()) { + sampleprof::FunctionSamples &FS = PD.second; + std::string Name = FS.getFunction().str(); + BuildMaxSampleMap(FS, Name); + } + + ProfileSummary InstrPS = *IPBuilder.getSummary(); + ProfileSummary SamplePS = Reader->getSummary(); + + // Compute cold thresholds for instr profile and sample profile. + uint64_t HotSampleThreshold = + ProfileSummaryBuilder::getEntryForPercentile( + SamplePS.getDetailedSummary(), + ProfileSummaryBuilder::DefaultCutoffs[HotPercentileIdx]) + .MinCount; + uint64_t ColdSampleThreshold = + ProfileSummaryBuilder::getEntryForPercentile( + SamplePS.getDetailedSummary(), + ProfileSummaryBuilder::DefaultCutoffs[ColdPercentileIdx]) + .MinCount; + uint64_t HotInstrThreshold = + ProfileSummaryBuilder::getEntryForPercentile( + InstrPS.getDetailedSummary(), + ProfileSummaryBuilder::DefaultCutoffs[HotPercentileIdx]) + .MinCount; + uint64_t ColdInstrThreshold = + InstrProfColdThreshold + ? InstrProfColdThreshold + : ProfileSummaryBuilder::getEntryForPercentile( + InstrPS.getDetailedSummary(), + ProfileSummaryBuilder::DefaultCutoffs[ColdPercentileIdx]) + .MinCount; + + // Find hot/warm functions in sample profile which is cold in instr profile + // and adjust the profiles of those functions in the instr profile. + for (const auto &E : FlattenSampleMap) { + uint64_t SampleMaxCount = std::max(E.second.first, E.second.second); + if (SampleMaxCount < ColdSampleThreshold) + continue; + StringRef Name = E.first(); + auto It = InstrProfileMap.find(Name); + if (It == InstrProfileMap.end()) { + auto NewName = StaticFuncMap.find(Name); + if (NewName != StaticFuncMap.end()) { + It = InstrProfileMap.find(NewName->second); + if (NewName->second == DuplicateNameStr) { + WithColor::warning() + << "Static function " << Name + << " has multiple promoted names, cannot adjust profile.\n"; + } + } + } + if (It == InstrProfileMap.end() || + It->second.MaxCount > ColdInstrThreshold || + It->second.NumEdgeCounters < SupplMinSizeThreshold) + continue; + bool SetToHot = SampleMaxCount >= HotSampleThreshold; + updateInstrProfileEntry(It->second, SetToHot, HotInstrThreshold, + ColdInstrThreshold, ZeroCounterThreshold); + } +} + +/// The main function to supplement instr profile with sample profile. +/// \Inputs contains the instr profile. \p SampleFilename specifies the +/// sample profile. \p OutputFilename specifies the output profile name. +/// \p OutputFormat specifies the output profile format. \p OutputSparse +/// specifies whether to generate sparse profile. \p SupplMinSizeThreshold +/// specifies the minimal size for the functions whose profile will be +/// adjusted. \p ZeroCounterThreshold is the threshold to check whether +/// a function contains too many zero counters and whether its profile +/// should be dropped. \p InstrProfColdThreshold is the user specified +/// cold threshold which will override the cold threshold got from the +/// instr profile summary. 
+static void supplementInstrProfile(const WeightedFileVector &Inputs,
+                                   StringRef SampleFilename, bool OutputSparse,
+                                   unsigned SupplMinSizeThreshold,
+                                   float ZeroCounterThreshold,
+                                   unsigned InstrProfColdThreshold) {
+  if (OutputFilename == "-")
+    exitWithError("cannot write indexed profdata format to stdout");
+  if (Inputs.size() != 1)
+    exitWithError("expect one input to be an instr profile");
+  if (Inputs[0].Weight != 1)
+    exitWithError("expect instr profile doesn't have weight");
+
+  StringRef InstrFilename = Inputs[0].Filename;
+
+  // Read sample profile.
+  LLVMContext Context;
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = sampleprof::SampleProfileReader::create(
+      SampleFilename.str(), Context, *FS, FSDiscriminatorPassOption);
+  if (std::error_code EC = ReaderOrErr.getError())
+    exitWithErrorCode(EC, SampleFilename);
+  auto Reader = std::move(ReaderOrErr.get());
+  if (std::error_code EC = Reader->read())
+    exitWithErrorCode(EC, SampleFilename);
+
+  // Read instr profile.
+  std::mutex ErrorLock;
+  SmallSet<instrprof_error, 4> WriterErrorCodes;
+  auto WC = std::make_unique<WriterContext>(OutputSparse, ErrorLock,
+                                            WriterErrorCodes);
+  loadInput(Inputs[0], nullptr, nullptr, /*ProfiledBinary=*/"", WC.get());
+  if (WC->Errors.size() > 0)
+    exitWithError(std::move(WC->Errors[0].first), InstrFilename);
+
+  adjustInstrProfile(WC, Reader, SupplMinSizeThreshold, ZeroCounterThreshold,
+                     InstrProfColdThreshold);
+  writeInstrProfile(OutputFilename, OutputFormat, WC->Writer);
+}
+
+/// Make a copy of the given function samples with all symbol names remapped
+/// by the provided symbol remapper.
+static sampleprof::FunctionSamples
+remapSamples(const sampleprof::FunctionSamples &Samples,
+             SymbolRemapper &Remapper, sampleprof_error &Error) {
+  sampleprof::FunctionSamples Result;
+  Result.setFunction(Remapper(Samples.getFunction()));
+  Result.addTotalSamples(Samples.getTotalSamples());
+  Result.addHeadSamples(Samples.getHeadSamples());
+  for (const auto &BodySample : Samples.getBodySamples()) {
+    uint32_t MaskedDiscriminator =
+        BodySample.first.Discriminator & getDiscriminatorMask();
+    Result.addBodySamples(BodySample.first.LineOffset, MaskedDiscriminator,
+                          BodySample.second.getSamples());
+    for (const auto &Target : BodySample.second.getCallTargets()) {
+      Result.addCalledTargetSamples(BodySample.first.LineOffset,
+                                    MaskedDiscriminator,
+                                    Remapper(Target.first), Target.second);
+    }
+  }
+  for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) {
+    sampleprof::FunctionSamplesMap &Target =
+        Result.functionSamplesAt(CallsiteSamples.first);
+    for (const auto &Callsite : CallsiteSamples.second) {
+      sampleprof::FunctionSamples Remapped =
+          remapSamples(Callsite.second, Remapper, Error);
+      mergeSampleProfErrors(Error,
+                            Target[Remapped.getFunction()].merge(Remapped));
+    }
+  }
+  return Result;
+}
+
+static sampleprof::SampleProfileFormat FormatMap[] = {
+    sampleprof::SPF_None,
+    sampleprof::SPF_Text,
+    sampleprof::SPF_None,
+    sampleprof::SPF_Ext_Binary,
+    sampleprof::SPF_GCC,
+    sampleprof::SPF_Binary};
+
+static std::unique_ptr<MemoryBuffer>
+getInputFileBuf(const StringRef &InputFile) {
+  if (InputFile == "")
+    return {};
+
+  auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFile);
+  if (!BufOrError)
+    exitWithErrorCode(BufOrError.getError(), InputFile);
+
+  return std::move(*BufOrError);
+}
+
+static void populateProfileSymbolList(MemoryBuffer *Buffer,
+                                      sampleprof::ProfileSymbolList &PSL) {
+  if (!Buffer)
+    return;
+
+  SmallVector<StringRef, 32> SymbolVec;
+  StringRef Data = Buffer->getBuffer();
+  Data.split(SymbolVec, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+
+  for (StringRef SymbolStr : SymbolVec)
+    PSL.add(SymbolStr.trim());
+}
+
+static void handleExtBinaryWriter(sampleprof::SampleProfileWriter &Writer,
+                                  ProfileFormat OutputFormat,
+                                  MemoryBuffer *Buffer,
+                                  sampleprof::ProfileSymbolList &WriterList,
+                                  bool CompressAllSections, bool UseMD5,
+                                  bool GenPartialProfile) {
+  if (SplitLayout) {
+    if (OutputFormat == PF_Binary)
+      warn("-split-layout is ignored. Specify -extbinary to enable it");
+    else
+      Writer.setUseCtxSplitLayout();
+  }
+
+  populateProfileSymbolList(Buffer, WriterList);
+  if (WriterList.size() > 0 && OutputFormat != PF_Ext_Binary)
+    warn("Profile Symbol list is not empty but the output format is not "
+         "ExtBinary format. The list will be lost in the output. ");
+
+  Writer.setProfileSymbolList(&WriterList);
+
+  if (CompressAllSections) {
+    if (OutputFormat != PF_Ext_Binary)
+      warn("-compress-all-section is ignored. Specify -extbinary to enable it");
+    else
+      Writer.setToCompressAllSections();
+  }
+  if (UseMD5) {
+    if (OutputFormat != PF_Ext_Binary)
+      warn("-use-md5 is ignored. Specify -extbinary to enable it");
+    else
+      Writer.setUseMD5();
+  }
+  if (GenPartialProfile) {
+    if (OutputFormat != PF_Ext_Binary)
+      warn("-gen-partial-profile is ignored. Specify -extbinary to enable it");
+    else
+      Writer.setPartialProfile();
+  }
+}
+
+static void mergeSampleProfile(const WeightedFileVector &Inputs,
+                               SymbolRemapper *Remapper,
+                               StringRef ProfileSymbolListFile,
+                               size_t OutputSizeLimit) {
+  using namespace sampleprof;
+  SampleProfileMap ProfileMap;
+  SmallVector<std::unique_ptr<SampleProfileReader>, 5> Readers;
+  LLVMContext Context;
+  sampleprof::ProfileSymbolList WriterList;
+  std::optional<bool> ProfileIsProbeBased;
+  std::optional<bool> ProfileIsCS;
+  for (const auto &Input : Inputs) {
+    auto FS = vfs::getRealFileSystem();
+    auto ReaderOrErr = SampleProfileReader::create(Input.Filename, Context, *FS,
+                                                   FSDiscriminatorPassOption);
+    if (std::error_code EC = ReaderOrErr.getError()) {
+      warnOrExitGivenError(FailMode, EC, Input.Filename);
+      continue;
+    }
+
+    // We need to keep the readers around until after all the files are
+    // read so that we do not lose the function names stored in each
+    // reader's memory. The function names are needed to write out the
+    // merged profile map.
+    Readers.push_back(std::move(ReaderOrErr.get()));
+    const auto Reader = Readers.back().get();
+    if (std::error_code EC = Reader->read()) {
+      warnOrExitGivenError(FailMode, EC, Input.Filename);
+      Readers.pop_back();
+      continue;
+    }
+
+    SampleProfileMap &Profiles = Reader->getProfiles();
+    if (ProfileIsProbeBased &&
+        ProfileIsProbeBased != FunctionSamples::ProfileIsProbeBased)
+      exitWithError(
+          "cannot merge probe-based profile with non-probe-based profile");
+    ProfileIsProbeBased = FunctionSamples::ProfileIsProbeBased;
+    if (ProfileIsCS && ProfileIsCS != FunctionSamples::ProfileIsCS)
+      exitWithError("cannot merge CS profile with non-CS profile");
+    ProfileIsCS = FunctionSamples::ProfileIsCS;
+    for (SampleProfileMap::iterator I = Profiles.begin(), E = Profiles.end();
+         I != E; ++I) {
+      sampleprof_error Result = sampleprof_error::success;
+      FunctionSamples Remapped =
+          Remapper ? remapSamples(I->second, *Remapper, Result)
+                   : FunctionSamples();
+      FunctionSamples &Samples = Remapper ? Remapped : I->second;
+      SampleContext FContext = Samples.getContext();
+      mergeSampleProfErrors(Result,
+                            ProfileMap[FContext].merge(Samples, Input.Weight));
+      if (Result != sampleprof_error::success) {
+        std::error_code EC = make_error_code(Result);
+        handleMergeWriterError(errorCodeToError(EC), Input.Filename,
+                               FContext.toString());
+      }
+    }
+
+    if (!DropProfileSymbolList) {
+      std::unique_ptr<ProfileSymbolList> ReaderList =
+          Reader->getProfileSymbolList();
+      if (ReaderList)
+        WriterList.merge(*ReaderList);
+    }
+  }
+
+  if (ProfileIsCS && (SampleMergeColdContext || SampleTrimColdContext)) {
+    // Use threshold calculated from profile summary unless specified.
+    SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
+    auto Summary = Builder.computeSummaryForProfiles(ProfileMap);
+    uint64_t SampleProfColdThreshold =
+        ProfileSummaryBuilder::getColdCountThreshold(
+            (Summary->getDetailedSummary()));
+
+    // Trim and merge cold context profiles using the cold threshold above.
+    SampleContextTrimmer(ProfileMap)
+        .trimAndMergeColdContextProfiles(
+            SampleProfColdThreshold, SampleTrimColdContext,
+            SampleMergeColdContext, SampleColdContextFrameDepth, false);
+  }
+
+  if (ProfileLayout == llvm::sampleprof::SPL_Flat) {
+    ProfileConverter::flattenProfile(ProfileMap, FunctionSamples::ProfileIsCS);
+    ProfileIsCS = FunctionSamples::ProfileIsCS = false;
+  } else if (ProfileIsCS && ProfileLayout == llvm::sampleprof::SPL_Nest) {
+    ProfileConverter CSConverter(ProfileMap);
+    CSConverter.convertCSProfiles();
+    ProfileIsCS = FunctionSamples::ProfileIsCS = false;
+  }
+
+  filterFunctions(ProfileMap);
+
+  auto WriterOrErr =
+      SampleProfileWriter::create(OutputFilename, FormatMap[OutputFormat]);
+  if (std::error_code EC = WriterOrErr.getError())
+    exitWithErrorCode(EC, OutputFilename);
+
+  auto Writer = std::move(WriterOrErr.get());
+  // WriterList will have StringRefs referring to strings in Buffer.
+  // Make sure Buffer lives as long as WriterList.
+  auto Buffer = getInputFileBuf(ProfileSymbolListFile);
+  handleExtBinaryWriter(*Writer, OutputFormat, Buffer.get(), WriterList,
+                        CompressAllSections, UseMD5, GenPartialProfile);
+
+  // If OutputSizeLimit is 0 (default), it is the same as write().
+  if (std::error_code EC =
+          Writer->writeWithSizeLimit(ProfileMap, OutputSizeLimit))
+    exitWithErrorCode(EC);
+}
+
+static WeightedFile parseWeightedFile(const StringRef &WeightedFilename) {
+  StringRef WeightStr, FileName;
+  std::tie(WeightStr, FileName) = WeightedFilename.split(',');
+
+  uint64_t Weight;
+  if (WeightStr.getAsInteger(10, Weight) || Weight < 1)
+    exitWithError("input weight must be a positive integer");
+
+  llvm::SmallString<128> ResolvedFileName;
+  llvm::sys::fs::expand_tilde(FileName, ResolvedFileName);
+
+  return {std::string(ResolvedFileName), Weight};
+}
+
+static void addWeightedInput(WeightedFileVector &WNI, const WeightedFile &WF) {
+  StringRef Filename = WF.Filename;
+  uint64_t Weight = WF.Weight;
+
+  // If it's STDIN just pass it on.
+  if (Filename == "-") {
+    WNI.push_back({std::string(Filename), Weight});
+    return;
+  }
+
+  llvm::sys::fs::file_status Status;
+  llvm::sys::fs::status(Filename, Status);
+  if (!llvm::sys::fs::exists(Status))
+    exitWithErrorCode(make_error_code(errc::no_such_file_or_directory),
+                      Filename);
+  // If it's a regular file, collect it.
+  if (llvm::sys::fs::is_regular_file(Status)) {
+    WNI.push_back({std::string(Filename), Weight});
+    return;
+  }
+
+  if (llvm::sys::fs::is_directory(Status)) {
+    std::error_code EC;
+    for (llvm::sys::fs::recursive_directory_iterator F(Filename, EC), E;
+         F != E && !EC; F.increment(EC)) {
+      if (llvm::sys::fs::is_regular_file(F->path())) {
+        addWeightedInput(WNI, {F->path(), Weight});
+      }
+    }
+    if (EC)
+      exitWithErrorCode(EC, Filename);
+  }
+}
+
+static void parseInputFilenamesFile(MemoryBuffer *Buffer,
+                                    WeightedFileVector &WFV) {
+  if (!Buffer)
+    return;
+
+  SmallVector<StringRef, 8> Entries;
+  StringRef Data = Buffer->getBuffer();
+  Data.split(Entries, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+  for (const StringRef &FileWeightEntry : Entries) {
+    StringRef SanitizedEntry = FileWeightEntry.trim(" \t\v\f\r");
+    // Skip comments.
+    if (SanitizedEntry.starts_with("#"))
+      continue;
+    // If there's no comma, it's an unweighted profile.
+    else if (!SanitizedEntry.contains(','))
+      addWeightedInput(WFV, {std::string(SanitizedEntry), 1});
+    else
+      addWeightedInput(WFV, parseWeightedFile(SanitizedEntry));
+  }
+}
+
+static int merge_main(StringRef ProgName) {
+  WeightedFileVector WeightedInputs;
+  for (StringRef Filename : InputFilenames)
+    addWeightedInput(WeightedInputs, {std::string(Filename), 1});
+  for (StringRef WeightedFilename : WeightedInputFilenames)
+    addWeightedInput(WeightedInputs, parseWeightedFile(WeightedFilename));
+
+  // Make sure that the file buffer stays alive for the duration of the
+  // weighted input vector's lifetime.
+  auto Buffer = getInputFileBuf(InputFilenamesFile);
+  parseInputFilenamesFile(Buffer.get(), WeightedInputs);
+
+  if (WeightedInputs.empty())
+    exitWithError("no input files specified. See " + ProgName + " merge -help");
+
+  if (DumpInputFileList) {
+    for (auto &WF : WeightedInputs)
+      outs() << WF.Weight << "," << WF.Filename << "\n";
+    return 0;
+  }
+
+  std::unique_ptr<SymbolRemapper> Remapper;
+  if (!RemappingFile.empty())
+    Remapper = SymbolRemapper::create(RemappingFile);
+
+  if (!SupplInstrWithSample.empty()) {
+    if (ProfileKind != instr)
+      exitWithError(
+          "-supplement-instr-with-sample can only work with -instr. ");
+
+    supplementInstrProfile(WeightedInputs, SupplInstrWithSample, OutputSparse,
+                           SupplMinSizeThreshold, ZeroCounterThreshold,
+                           InstrProfColdThreshold);
+    return 0;
+  }
+
+  if (ProfileKind == instr)
+    mergeInstrProfile(WeightedInputs, Remapper.get(), MaxDbgCorrelationWarnings,
+                      ProfiledBinary);
+  else
+    mergeSampleProfile(WeightedInputs, Remapper.get(), ProfileSymbolListFile,
+                       OutputSizeLimit);
+  return 0;
+}
+
+/// Compute the overlap between profile BaseFilename and profile TestFilename.
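+/// An illustrative invocation (file names are placeholders):
+///   llvm-profdata overlap base.profdata test.profdata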
+static void overlapInstrProfile(const std::string &BaseFilename,
+                                const std::string &TestFilename,
+                                const OverlapFuncFilters &FuncFilter,
+                                raw_fd_ostream &OS, bool IsCS) {
+  std::mutex ErrorLock;
+  SmallSet<instrprof_error, 4> WriterErrorCodes;
+  WriterContext Context(false, ErrorLock, WriterErrorCodes);
+  WeightedFile WeightedInput{BaseFilename, 1};
+  OverlapStats Overlap;
+  Error E = Overlap.accumulateCounts(BaseFilename, TestFilename, IsCS);
+  if (E)
+    exitWithError(std::move(E), "error in getting profile count sums");
+  if (Overlap.Base.CountSum < 1.0f) {
+    OS << "Sum of edge counts for profile " << BaseFilename << " is 0.\n";
+    exit(0);
+  }
+  if (Overlap.Test.CountSum < 1.0f) {
+    OS << "Sum of edge counts for profile " << TestFilename << " is 0.\n";
+    exit(0);
+  }
+  loadInput(WeightedInput, nullptr, nullptr, /*ProfiledBinary=*/"", &Context);
+  overlapInput(BaseFilename, TestFilename, &Context, Overlap, FuncFilter, OS,
+               IsCS);
+  Overlap.dump(OS);
+}
+
+namespace {
+struct SampleOverlapStats {
+  SampleContext BaseName;
+  SampleContext TestName;
+  // Number of overlap units
+  uint64_t OverlapCount = 0;
+  // Total samples of overlap units
+  uint64_t OverlapSample = 0;
+  // Number of and total samples of units that are only present in base or
+  // test profile
+  uint64_t BaseUniqueCount = 0;
+  uint64_t BaseUniqueSample = 0;
+  uint64_t TestUniqueCount = 0;
+  uint64_t TestUniqueSample = 0;
+  // Number of units and total samples in base or test profile
+  uint64_t BaseCount = 0;
+  uint64_t BaseSample = 0;
+  uint64_t TestCount = 0;
+  uint64_t TestSample = 0;
+  // Number of and total samples of units that are present in at least one
+  // profile
+  uint64_t UnionCount = 0;
+  uint64_t UnionSample = 0;
+  // Weighted similarity
+  double Similarity = 0.0;
+  // For SampleOverlapStats instances representing functions, weights of the
+  // function in base and test profiles
+  double BaseWeight = 0.0;
+  double TestWeight = 0.0;
+
+  SampleOverlapStats() = default;
+};
+} // end anonymous namespace
+
+namespace {
+struct FuncSampleStats {
+  uint64_t SampleSum = 0;
+  uint64_t MaxSample = 0;
+  uint64_t HotBlockCount = 0;
+  FuncSampleStats() = default;
+  FuncSampleStats(uint64_t SampleSum, uint64_t MaxSample,
+                  uint64_t HotBlockCount)
+      : SampleSum(SampleSum), MaxSample(MaxSample),
+        HotBlockCount(HotBlockCount) {}
+};
+} // end anonymous namespace
+
+namespace {
+enum MatchStatus { MS_Match, MS_FirstUnique, MS_SecondUnique, MS_None };
+
+// Class for updating merging steps for two sorted maps. The class should be
+// instantiated with a map iterator type.
+template <class T> class MatchStep {
+public:
+  MatchStep() = delete;
+
+  MatchStep(T FirstIter, T FirstEnd, T SecondIter, T SecondEnd)
+      : FirstIter(FirstIter), FirstEnd(FirstEnd), SecondIter(SecondIter),
+        SecondEnd(SecondEnd), Status(MS_None) {}
+
+  bool areBothFinished() const {
+    return (FirstIter == FirstEnd && SecondIter == SecondEnd);
+  }
+
+  bool isFirstFinished() const { return FirstIter == FirstEnd; }
+
+  bool isSecondFinished() const { return SecondIter == SecondEnd; }
+
+  /// Advance one step based on the previous match status unless the previous
+  /// status is MS_None. Then update Status based on the comparison between two
+  /// container iterators at the current step. If the previous status is
+  /// MS_None, it means two iterators are at the beginning and no comparison
+  /// has been made, so we simply update Status without advancing the
+  /// iterators.
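+  /// For example, merging keys {1, 3} with {2, 3} yields the step sequence
+  /// 1 (MS_FirstUnique), 2 (MS_SecondUnique), then 3 (MS_Match).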
+  void updateOneStep();
+
+  T getFirstIter() const { return FirstIter; }
+
+  T getSecondIter() const { return SecondIter; }
+
+  MatchStatus getMatchStatus() const { return Status; }
+
+private:
+  // Current iterator and end iterator of the first container.
+  T FirstIter;
+  T FirstEnd;
+  // Current iterator and end iterator of the second container.
+  T SecondIter;
+  T SecondEnd;
+  // Match status of the current step.
+  MatchStatus Status;
+};
+} // end anonymous namespace
+
+template <class T> void MatchStep<T>::updateOneStep() {
+  switch (Status) {
+  case MS_Match:
+    ++FirstIter;
+    ++SecondIter;
+    break;
+  case MS_FirstUnique:
+    ++FirstIter;
+    break;
+  case MS_SecondUnique:
+    ++SecondIter;
+    break;
+  case MS_None:
+    break;
+  }
+
+  // Update Status according to iterators at the current step.
+  if (areBothFinished())
+    return;
+  if (FirstIter != FirstEnd &&
+      (SecondIter == SecondEnd || FirstIter->first < SecondIter->first))
+    Status = MS_FirstUnique;
+  else if (SecondIter != SecondEnd &&
+           (FirstIter == FirstEnd || SecondIter->first < FirstIter->first))
+    Status = MS_SecondUnique;
+  else
+    Status = MS_Match;
+}
+
+// Return the sum of line/block samples, the max line/block sample, and the
+// number of line/block samples above the given threshold in a function
+// including its inlinees.
+static void getFuncSampleStats(const sampleprof::FunctionSamples &Func,
+                               FuncSampleStats &FuncStats,
+                               uint64_t HotThreshold) {
+  for (const auto &L : Func.getBodySamples()) {
+    uint64_t Sample = L.second.getSamples();
+    FuncStats.SampleSum += Sample;
+    FuncStats.MaxSample = std::max(FuncStats.MaxSample, Sample);
+    if (Sample >= HotThreshold)
+      ++FuncStats.HotBlockCount;
+  }
+
+  for (const auto &C : Func.getCallsiteSamples()) {
+    for (const auto &F : C.second)
+      getFuncSampleStats(F.second, FuncStats, HotThreshold);
+  }
+}
+
+/// Predicate that determines if a function is hot with a given threshold. We
+/// keep it separate from its callsites for possible extension in the future.
+static bool isFunctionHot(const FuncSampleStats &FuncStats,
+                          uint64_t HotThreshold) {
+  // We intentionally compare the maximum sample count in a function with the
+  // HotThreshold to get an approximate determination on hot functions.
+  return (FuncStats.MaxSample >= HotThreshold);
+}
+
+namespace {
+class SampleOverlapAggregator {
+public:
+  SampleOverlapAggregator(const std::string &BaseFilename,
+                          const std::string &TestFilename,
+                          double LowSimilarityThreshold, double Epsilon,
+                          const OverlapFuncFilters &FuncFilter)
+      : BaseFilename(BaseFilename), TestFilename(TestFilename),
+        LowSimilarityThreshold(LowSimilarityThreshold), Epsilon(Epsilon),
+        FuncFilter(FuncFilter) {}
+
+  /// Detect 0-sample input profile and report to output stream. This interface
+  /// should be called after loadProfiles().
+  bool detectZeroSampleProfile(raw_fd_ostream &OS) const;
+
+  /// Write out function-level similarity statistics for functions specified by
+  /// options --function, --value-cutoff, and --similarity-cutoff.
+  void dumpFuncSimilarity(raw_fd_ostream &OS) const;
+
+  /// Write out program-level similarity and overlap statistics.
+  void dumpProgramSummary(raw_fd_ostream &OS) const;
+
+  /// Write out hot-function and hot-block statistics for base_profile,
+  /// test_profile, and their overlap. For both cases, the overlap HO is
+  /// calculated as follows:
+  ///    Given the number of functions (or blocks) that are hot in both
+  ///    profiles HCommon and the number of functions (or blocks) that are hot
+  ///    in at least one profile HUnion, HO = HCommon / HUnion.
+  void dumpHotFuncAndBlockOverlap(raw_fd_ostream &OS) const;
+
+  /// This function tries matching functions in base and test profiles. For
+  /// each pair of matched functions, it aggregates the function-level
+  /// similarity into a profile-level similarity. It also dumps function-level
+  /// similarity information of functions specified by --function,
+  /// --value-cutoff, and --similarity-cutoff options. The program-level
+  /// similarity PS is computed as follows:
+  ///    Given function-level similarity FS(A) for all function A, the
+  ///    weight of function A in base profile WB(A), and the weight of function
+  ///    A in test profile WT(A), compute PS(base_profile, test_profile) =
+  ///    sum_A(FS(A) * avg(WB(A), WT(A))) ranging in [0.0f to 1.0f] with 0.0
+  ///    meaning no-overlap.
+  void computeSampleProfileOverlap(raw_fd_ostream &OS);
+
+  /// Initialize ProfOverlap with the sum of samples in base and test
+  /// profiles. This function also computes and keeps the sum of samples and
+  /// max sample counts of each function in BaseStats and TestStats for later
+  /// use to avoid re-computations.
+  void initializeSampleProfileOverlap();
+
+  /// Load profiles specified by BaseFilename and TestFilename.
+  std::error_code loadProfiles();
+
+  using FuncSampleStatsMap =
+      std::unordered_map<SampleContext, FuncSampleStats, SampleContext::Hash>;
+
+private:
+  SampleOverlapStats ProfOverlap;
+  SampleOverlapStats HotFuncOverlap;
+  SampleOverlapStats HotBlockOverlap;
+  std::string BaseFilename;
+  std::string TestFilename;
+  std::unique_ptr<sampleprof::SampleProfileReader> BaseReader;
+  std::unique_ptr<sampleprof::SampleProfileReader> TestReader;
+  // BaseStats and TestStats hold FuncSampleStats for each function, with
+  // function name as the key.
+  FuncSampleStatsMap BaseStats;
+  FuncSampleStatsMap TestStats;
+  // Low similarity threshold in floating point number
+  double LowSimilarityThreshold;
+  // Block samples above BaseHotThreshold or TestHotThreshold are considered
+  // hot for tracking hot blocks.
+  uint64_t BaseHotThreshold;
+  uint64_t TestHotThreshold;
+  // A small threshold used to round the results of floating point
+  // accumulations to resolve imprecision.
+  const double Epsilon;
+  std::multimap<double, SampleOverlapStats, std::greater<double>>
+      FuncSimilarityDump;
+  // FuncFilter carries specifications in options --value-cutoff and
+  // --function.
+  OverlapFuncFilters FuncFilter;
+  // Column offsets for printing the function-level details table.
+  static const unsigned int TestWeightCol = 15;
+  static const unsigned int SimilarityCol = 30;
+  static const unsigned int OverlapCol = 43;
+  static const unsigned int BaseUniqueCol = 53;
+  static const unsigned int TestUniqueCol = 67;
+  static const unsigned int BaseSampleCol = 81;
+  static const unsigned int TestSampleCol = 96;
+  static const unsigned int FuncNameCol = 111;
+
+  /// Return a similarity of two line/block sample counters in the same
+  /// function in base and test profiles. The line/block-similarity BS(i) is
+  /// computed as follows:
+  ///    For an offset i, given the sample count at i in base profile BB(i),
+  ///    the sample count at i in test profile BT(i), the sum of sample counts
+  ///    in this function in base profile SB, and the sum of sample counts in
+  ///    this function in test profile ST, compute BS(i) = 1.0 - fabs(BB(i)/SB
+  ///    - BT(i)/ST), ranging in [0.0f to 1.0f] with 0.0 meaning no-overlap.
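+  /// For instance, with BB(i)=30, SB=100, BT(i)=10, ST=50:
+  /// BS(i) = 1.0 - fabs(30/100 - 10/50) = 1.0 - 0.1 = 0.9.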
+ double computeBlockSimilarity(uint64_t BaseSample, uint64_t TestSample, + const SampleOverlapStats &FuncOverlap) const; + + void updateHotBlockOverlap(uint64_t BaseSample, uint64_t TestSample, + uint64_t HotBlockCount); + + void getHotFunctions(const FuncSampleStatsMap &ProfStats, + FuncSampleStatsMap &HotFunc, + uint64_t HotThreshold) const; + + void computeHotFuncOverlap(); + + /// This function updates statistics in FuncOverlap, HotBlockOverlap, and + /// Difference for two sample units in a matched function according to the + /// given match status. + void updateOverlapStatsForFunction(uint64_t BaseSample, uint64_t TestSample, + uint64_t HotBlockCount, + SampleOverlapStats &FuncOverlap, + double &Difference, MatchStatus Status); + + /// This function updates statistics in FuncOverlap, HotBlockOverlap, and + /// Difference for unmatched callees that only present in one profile in a + /// matched caller function. + void updateForUnmatchedCallee(const sampleprof::FunctionSamples &Func, + SampleOverlapStats &FuncOverlap, + double &Difference, MatchStatus Status); + + /// This function updates sample overlap statistics of an overlap function in + /// base and test profile. It also calculates a function-internal similarity + /// FIS as follows: + /// For offsets i that have samples in at least one profile in this + /// function A, given BS(i) returned by computeBlockSimilarity(), compute + /// FIS(A) = (2.0 - sum_i(1.0 - BS(i))) / 2, ranging in [0.0f to 1.0f] with + /// 0.0 meaning no overlap. + double computeSampleFunctionInternalOverlap( + const sampleprof::FunctionSamples &BaseFunc, + const sampleprof::FunctionSamples &TestFunc, + SampleOverlapStats &FuncOverlap); + + /// Function-level similarity (FS) is a weighted value over function internal + /// similarity (FIS). This function computes a function's FS from its FIS by + /// applying the weight. + double weightForFuncSimilarity(double FuncSimilarity, uint64_t BaseFuncSample, + uint64_t TestFuncSample) const; + + /// The function-level similarity FS(A) for a function A is computed as + /// follows: + /// Compute a function-internal similarity FIS(A) by + /// computeSampleFunctionInternalOverlap(). Then, with the weight of + /// function A in base profile WB(A), and the weight of function A in test + /// profile WT(A), compute FS(A) = FIS(A) * (1.0 - fabs(WB(A) - WT(A))) + /// ranging in [0.0f to 1.0f] with 0.0 meaning no overlap. + double + computeSampleFunctionOverlap(const sampleprof::FunctionSamples *BaseFunc, + const sampleprof::FunctionSamples *TestFunc, + SampleOverlapStats *FuncOverlap, + uint64_t BaseFuncSample, + uint64_t TestFuncSample); + + /// Profile-level similarity (PS) is a weighted aggregate over function-level + /// similarities (FS). This method weights the FS value by the function + /// weights in the base and test profiles for the aggregation. 
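+  /// For instance, FS=0.9 with WB=0.10 and WT=0.06 contributes
+  /// 0.9 * (0.10 + 0.06) / 2 = 0.072 to the program-level similarity.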
+  double weightByImportance(double FuncSimilarity, uint64_t BaseFuncSample,
+                            uint64_t TestFuncSample) const;
+};
+} // end anonymous namespace
+
+bool SampleOverlapAggregator::detectZeroSampleProfile(
+    raw_fd_ostream &OS) const {
+  bool HaveZeroSample = false;
+  if (ProfOverlap.BaseSample == 0) {
+    OS << "Sum of sample counts for profile " << BaseFilename << " is 0.\n";
+    HaveZeroSample = true;
+  }
+  if (ProfOverlap.TestSample == 0) {
+    OS << "Sum of sample counts for profile " << TestFilename << " is 0.\n";
+    HaveZeroSample = true;
+  }
+  return HaveZeroSample;
+}
+
+double SampleOverlapAggregator::computeBlockSimilarity(
+    uint64_t BaseSample, uint64_t TestSample,
+    const SampleOverlapStats &FuncOverlap) const {
+  double BaseFrac = 0.0;
+  double TestFrac = 0.0;
+  if (FuncOverlap.BaseSample > 0)
+    BaseFrac = static_cast<double>(BaseSample) / FuncOverlap.BaseSample;
+  if (FuncOverlap.TestSample > 0)
+    TestFrac = static_cast<double>(TestSample) / FuncOverlap.TestSample;
+  return 1.0 - std::fabs(BaseFrac - TestFrac);
+}
+
+void SampleOverlapAggregator::updateHotBlockOverlap(uint64_t BaseSample,
+                                                    uint64_t TestSample,
+                                                    uint64_t HotBlockCount) {
+  bool IsBaseHot = (BaseSample >= BaseHotThreshold);
+  bool IsTestHot = (TestSample >= TestHotThreshold);
+  if (!IsBaseHot && !IsTestHot)
+    return;
+
+  HotBlockOverlap.UnionCount += HotBlockCount;
+  if (IsBaseHot)
+    HotBlockOverlap.BaseCount += HotBlockCount;
+  if (IsTestHot)
+    HotBlockOverlap.TestCount += HotBlockCount;
+  if (IsBaseHot && IsTestHot)
+    HotBlockOverlap.OverlapCount += HotBlockCount;
+}
+
+void SampleOverlapAggregator::getHotFunctions(
+    const FuncSampleStatsMap &ProfStats, FuncSampleStatsMap &HotFunc,
+    uint64_t HotThreshold) const {
+  for (const auto &F : ProfStats) {
+    if (isFunctionHot(F.second, HotThreshold))
+      HotFunc.emplace(F.first, F.second);
+  }
+}
+
+void SampleOverlapAggregator::computeHotFuncOverlap() {
+  FuncSampleStatsMap BaseHotFunc;
+  getHotFunctions(BaseStats, BaseHotFunc, BaseHotThreshold);
+  HotFuncOverlap.BaseCount = BaseHotFunc.size();
+
+  FuncSampleStatsMap TestHotFunc;
+  getHotFunctions(TestStats, TestHotFunc, TestHotThreshold);
+  HotFuncOverlap.TestCount = TestHotFunc.size();
+  HotFuncOverlap.UnionCount = HotFuncOverlap.TestCount;
+
+  for (const auto &F : BaseHotFunc) {
+    if (TestHotFunc.count(F.first))
+      ++HotFuncOverlap.OverlapCount;
+    else
+      ++HotFuncOverlap.UnionCount;
+  }
+}
+
+void SampleOverlapAggregator::updateOverlapStatsForFunction(
+    uint64_t BaseSample, uint64_t TestSample, uint64_t HotBlockCount,
+    SampleOverlapStats &FuncOverlap, double &Difference, MatchStatus Status) {
+  assert(Status != MS_None &&
+         "Match status should be updated before updating overlap statistics");
+  if (Status == MS_FirstUnique) {
+    TestSample = 0;
+    FuncOverlap.BaseUniqueSample += BaseSample;
+  } else if (Status == MS_SecondUnique) {
+    BaseSample = 0;
+    FuncOverlap.TestUniqueSample += TestSample;
+  } else {
+    ++FuncOverlap.OverlapCount;
+  }
+
+  FuncOverlap.UnionSample += std::max(BaseSample, TestSample);
+  FuncOverlap.OverlapSample += std::min(BaseSample, TestSample);
+  Difference +=
+      1.0 - computeBlockSimilarity(BaseSample, TestSample, FuncOverlap);
+  updateHotBlockOverlap(BaseSample, TestSample, HotBlockCount);
+}
+
+void SampleOverlapAggregator::updateForUnmatchedCallee(
+    const sampleprof::FunctionSamples &Func, SampleOverlapStats &FuncOverlap,
+    double &Difference, MatchStatus Status) {
+  assert((Status == MS_FirstUnique || Status == MS_SecondUnique) &&
+         "Status must be either of the two unmatched cases");
+  FuncSampleStats FuncStats;
+  if (Status == MS_FirstUnique) {
+    getFuncSampleStats(Func, FuncStats, BaseHotThreshold);
+    updateOverlapStatsForFunction(FuncStats.SampleSum, 0,
+                                  FuncStats.HotBlockCount, FuncOverlap,
+                                  Difference, Status);
+  } else {
+    getFuncSampleStats(Func, FuncStats, TestHotThreshold);
+    updateOverlapStatsForFunction(0, FuncStats.SampleSum,
+                                  FuncStats.HotBlockCount, FuncOverlap,
+                                  Difference, Status);
+  }
+}
+
+double SampleOverlapAggregator::computeSampleFunctionInternalOverlap(
+    const sampleprof::FunctionSamples &BaseFunc,
+    const sampleprof::FunctionSamples &TestFunc,
+    SampleOverlapStats &FuncOverlap) {
+
+  using namespace sampleprof;
+
+  double Difference = 0;
+
+  // Accumulate Difference for regular line/block samples in the function.
+  // We match them through sort-merge join algorithm because
+  // FunctionSamples::getBodySamples() returns a map of sample counters ordered
+  // by their offsets.
+  MatchStep<BodySampleMap::const_iterator> BlockIterStep(
+      BaseFunc.getBodySamples().cbegin(), BaseFunc.getBodySamples().cend(),
+      TestFunc.getBodySamples().cbegin(), TestFunc.getBodySamples().cend());
+  BlockIterStep.updateOneStep();
+  while (!BlockIterStep.areBothFinished()) {
+    uint64_t BaseSample =
+        BlockIterStep.isFirstFinished()
+            ? 0
+            : BlockIterStep.getFirstIter()->second.getSamples();
+    uint64_t TestSample =
+        BlockIterStep.isSecondFinished()
+            ? 0
+            : BlockIterStep.getSecondIter()->second.getSamples();
+    updateOverlapStatsForFunction(BaseSample, TestSample, 1, FuncOverlap,
+                                  Difference, BlockIterStep.getMatchStatus());
+
+    BlockIterStep.updateOneStep();
+  }
+
+  // Accumulate Difference for callsite lines in the function. We match
+  // them through sort-merge algorithm because
+  // FunctionSamples::getCallsiteSamples() returns a map of callsite records
+  // ordered by their offsets.
+  MatchStep<CallsiteSampleMap::const_iterator> CallsiteIterStep(
+      BaseFunc.getCallsiteSamples().cbegin(),
+      BaseFunc.getCallsiteSamples().cend(),
+      TestFunc.getCallsiteSamples().cbegin(),
+      TestFunc.getCallsiteSamples().cend());
+  CallsiteIterStep.updateOneStep();
+  while (!CallsiteIterStep.areBothFinished()) {
+    MatchStatus CallsiteStepStatus = CallsiteIterStep.getMatchStatus();
+    assert(CallsiteStepStatus != MS_None &&
+           "Match status should be updated before entering loop body");
+
+    if (CallsiteStepStatus != MS_Match) {
+      auto Callsite = (CallsiteStepStatus == MS_FirstUnique)
+                          ? CallsiteIterStep.getFirstIter()
+                          : CallsiteIterStep.getSecondIter();
+      for (const auto &F : Callsite->second)
+        updateForUnmatchedCallee(F.second, FuncOverlap, Difference,
+                                 CallsiteStepStatus);
+    } else {
+      // There may be multiple inlinees at the same offset, so we need to try
+      // matching all of them. This match is implemented through sort-merge
+      // algorithm because callsite records at the same offset are ordered by
+      // function names.
+      MatchStep<FunctionSamplesMap::const_iterator> CalleeIterStep(
+          CallsiteIterStep.getFirstIter()->second.cbegin(),
+          CallsiteIterStep.getFirstIter()->second.cend(),
+          CallsiteIterStep.getSecondIter()->second.cbegin(),
+          CallsiteIterStep.getSecondIter()->second.cend());
+      CalleeIterStep.updateOneStep();
+      while (!CalleeIterStep.areBothFinished()) {
+        MatchStatus CalleeStepStatus = CalleeIterStep.getMatchStatus();
+        if (CalleeStepStatus != MS_Match) {
+          auto Callee = (CalleeStepStatus == MS_FirstUnique)
+                            ? CalleeIterStep.getFirstIter()
+                            : CalleeIterStep.getSecondIter();
+          updateForUnmatchedCallee(Callee->second, FuncOverlap, Difference,
+                                   CalleeStepStatus);
+        } else {
+          // An inlined function can contain other inlinees inside, so compute
+          // the Difference recursively.
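+          // E.g., a matched callee pair whose recursive similarity is 0.8
+          // adds 2.0 - 2 * 0.8 = 0.4 to Difference.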
+          Difference += 2.0 - 2 * computeSampleFunctionInternalOverlap(
+                                      CalleeIterStep.getFirstIter()->second,
+                                      CalleeIterStep.getSecondIter()->second,
+                                      FuncOverlap);
+        }
+        CalleeIterStep.updateOneStep();
+      }
+    }
+    CallsiteIterStep.updateOneStep();
+  }
+
+  // Difference reflects the total differences of line/block samples in this
+  // function and ranges in [0.0f to 2.0f]. Take (2.0 - Difference) / 2 to
+  // reflect the similarity between function profiles in [0.0f to 1.0f].
+  return (2.0 - Difference) / 2;
+}
+
+double SampleOverlapAggregator::weightForFuncSimilarity(
+    double FuncInternalSimilarity, uint64_t BaseFuncSample,
+    uint64_t TestFuncSample) const {
+  // Compute the weight as the distance between the function weights in two
+  // profiles.
+  double BaseFrac = 0.0;
+  double TestFrac = 0.0;
+  assert(ProfOverlap.BaseSample > 0 &&
+         "Total samples in base profile should be greater than 0");
+  BaseFrac = static_cast<double>(BaseFuncSample) / ProfOverlap.BaseSample;
+  assert(ProfOverlap.TestSample > 0 &&
+         "Total samples in test profile should be greater than 0");
+  TestFrac = static_cast<double>(TestFuncSample) / ProfOverlap.TestSample;
+  double WeightDistance = std::fabs(BaseFrac - TestFrac);
+
+  // Take WeightDistance into the similarity.
+  return FuncInternalSimilarity * (1 - WeightDistance);
+}
+
+double
+SampleOverlapAggregator::weightByImportance(double FuncSimilarity,
+                                            uint64_t BaseFuncSample,
+                                            uint64_t TestFuncSample) const {
+
+  double BaseFrac = 0.0;
+  double TestFrac = 0.0;
+  assert(ProfOverlap.BaseSample > 0 &&
+         "Total samples in base profile should be greater than 0");
+  BaseFrac = static_cast<double>(BaseFuncSample) / ProfOverlap.BaseSample / 2.0;
+  assert(ProfOverlap.TestSample > 0 &&
+         "Total samples in test profile should be greater than 0");
+  TestFrac = static_cast<double>(TestFuncSample) / ProfOverlap.TestSample / 2.0;
+  return FuncSimilarity * (BaseFrac + TestFrac);
+}
+
+double SampleOverlapAggregator::computeSampleFunctionOverlap(
+    const sampleprof::FunctionSamples *BaseFunc,
+    const sampleprof::FunctionSamples *TestFunc,
+    SampleOverlapStats *FuncOverlap, uint64_t BaseFuncSample,
+    uint64_t TestFuncSample) {
+  // Default function-internal similarity before weighting, meaning the two
+  // functions have no overlap.
+  const double DefaultFuncInternalSimilarity = 0;
+  double FuncSimilarity;
+  double FuncInternalSimilarity;
+
+  // If BaseFunc or TestFunc is nullptr, it means the functions do not overlap.
+  // In this case, we use DefaultFuncInternalSimilarity as the function
+  // internal similarity.
+  if (!BaseFunc || !TestFunc) {
+    FuncInternalSimilarity = DefaultFuncInternalSimilarity;
+  } else {
+    assert(FuncOverlap != nullptr &&
+           "FuncOverlap should be provided in this case");
+    FuncInternalSimilarity = computeSampleFunctionInternalOverlap(
+        *BaseFunc, *TestFunc, *FuncOverlap);
+    // Now, FuncInternalSimilarity may be a little less than 0 due to
+    // imprecision of floating point accumulations. Make it zero if the
+    // difference is below Epsilon.
+    FuncInternalSimilarity = (std::fabs(FuncInternalSimilarity - 0) < Epsilon)
+                                 ? 0
+                                 : FuncInternalSimilarity;
+  }
+  FuncSimilarity = weightForFuncSimilarity(FuncInternalSimilarity,
+                                           BaseFuncSample, TestFuncSample);
+  return FuncSimilarity;
+}
+
+void SampleOverlapAggregator::computeSampleProfileOverlap(raw_fd_ostream &OS) {
+  using namespace sampleprof;
+
+  std::unordered_map<SampleContext, const FunctionSamples *,
+                     SampleContext::Hash>
+      BaseFuncProf;
+  const auto &BaseProfiles = BaseReader->getProfiles();
+  for (const auto &BaseFunc : BaseProfiles) {
+    BaseFuncProf.emplace(BaseFunc.second.getContext(), &(BaseFunc.second));
+  }
+  ProfOverlap.UnionCount = BaseFuncProf.size();
+
+  const auto &TestProfiles = TestReader->getProfiles();
+  for (const auto &TestFunc : TestProfiles) {
+    SampleOverlapStats FuncOverlap;
+    FuncOverlap.TestName = TestFunc.second.getContext();
+    assert(TestStats.count(FuncOverlap.TestName) &&
+           "TestStats should have records for all functions in test profile "
+           "except inlinees");
+    FuncOverlap.TestSample = TestStats[FuncOverlap.TestName].SampleSum;
+
+    bool Matched = false;
+    const auto Match = BaseFuncProf.find(FuncOverlap.TestName);
+    if (Match == BaseFuncProf.end()) {
+      const FuncSampleStats &FuncStats = TestStats[FuncOverlap.TestName];
+      ++ProfOverlap.TestUniqueCount;
+      ProfOverlap.TestUniqueSample += FuncStats.SampleSum;
+      FuncOverlap.TestUniqueSample = FuncStats.SampleSum;
+
+      updateHotBlockOverlap(0, FuncStats.SampleSum, FuncStats.HotBlockCount);
+
+      double FuncSimilarity = computeSampleFunctionOverlap(
+          nullptr, nullptr, nullptr, 0, FuncStats.SampleSum);
+      ProfOverlap.Similarity +=
+          weightByImportance(FuncSimilarity, 0, FuncStats.SampleSum);
+
+      ++ProfOverlap.UnionCount;
+      ProfOverlap.UnionSample += FuncStats.SampleSum;
+    } else {
+      ++ProfOverlap.OverlapCount;
+
+      // Two functions match with each other. Compute function-level overlap
+      // and aggregate them into profile-level overlap.
+      FuncOverlap.BaseName = Match->second->getContext();
+      assert(BaseStats.count(FuncOverlap.BaseName) &&
+             "BaseStats should have records for all functions in base profile "
+             "except inlinees");
+      FuncOverlap.BaseSample = BaseStats[FuncOverlap.BaseName].SampleSum;
+
+      FuncOverlap.Similarity = computeSampleFunctionOverlap(
+          Match->second, &TestFunc.second, &FuncOverlap, FuncOverlap.BaseSample,
+          FuncOverlap.TestSample);
+      ProfOverlap.Similarity +=
+          weightByImportance(FuncOverlap.Similarity, FuncOverlap.BaseSample,
+                             FuncOverlap.TestSample);
+      ProfOverlap.OverlapSample += FuncOverlap.OverlapSample;
+      ProfOverlap.UnionSample += FuncOverlap.UnionSample;
+
+      // Accumulate the percentage of base unique and test unique samples into
+      // ProfOverlap.
+      ProfOverlap.BaseUniqueSample += FuncOverlap.BaseUniqueSample;
+      ProfOverlap.TestUniqueSample += FuncOverlap.TestUniqueSample;
+
+      // Remove matched base functions for later reporting functions not found
+      // in test profile.
+      BaseFuncProf.erase(Match);
+      Matched = true;
+    }
+
+    // Print function-level similarity information if specified by options.
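+    // (That is, when --value-cutoff, --similarity-cutoff, or --function
+    // selects this function; see the conditions below.)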
+    assert(TestStats.count(FuncOverlap.TestName) &&
+           "TestStats should have records for all functions in test profile "
+           "except inlinees");
+    if (TestStats[FuncOverlap.TestName].MaxSample >= FuncFilter.ValueCutoff ||
+        (Matched && FuncOverlap.Similarity < LowSimilarityThreshold) ||
+        (Matched && !FuncFilter.NameFilter.empty() &&
+         FuncOverlap.BaseName.toString().find(FuncFilter.NameFilter) !=
+             std::string::npos)) {
+      assert(ProfOverlap.BaseSample > 0 &&
+             "Total samples in base profile should be greater than 0");
+      FuncOverlap.BaseWeight =
+          static_cast<double>(FuncOverlap.BaseSample) / ProfOverlap.BaseSample;
+      assert(ProfOverlap.TestSample > 0 &&
+             "Total samples in test profile should be greater than 0");
+      FuncOverlap.TestWeight =
+          static_cast<double>(FuncOverlap.TestSample) / ProfOverlap.TestSample;
+      FuncSimilarityDump.emplace(FuncOverlap.BaseWeight, FuncOverlap);
+    }
+  }
+
+  // Traverse through functions in base profile but not in test profile.
+  for (const auto &F : BaseFuncProf) {
+    assert(BaseStats.count(F.second->getContext()) &&
+           "BaseStats should have records for all functions in base profile "
+           "except inlinees");
+    const FuncSampleStats &FuncStats = BaseStats[F.second->getContext()];
+    ++ProfOverlap.BaseUniqueCount;
+    ProfOverlap.BaseUniqueSample += FuncStats.SampleSum;
+
+    updateHotBlockOverlap(FuncStats.SampleSum, 0, FuncStats.HotBlockCount);
+
+    double FuncSimilarity = computeSampleFunctionOverlap(
+        nullptr, nullptr, nullptr, FuncStats.SampleSum, 0);
+    ProfOverlap.Similarity +=
+        weightByImportance(FuncSimilarity, FuncStats.SampleSum, 0);
+
+    ProfOverlap.UnionSample += FuncStats.SampleSum;
+  }
+
+  // Now, ProfSimilarity may be a little greater than 1 due to imprecision
+  // of floating point accumulations. Make it 1.0 if the difference is below
+  // Epsilon.
+  ProfOverlap.Similarity = (std::fabs(ProfOverlap.Similarity - 1) < Epsilon)
+                               ? 1
+                               : ProfOverlap.Similarity;
+
+  computeHotFuncOverlap();
+}
+
+void SampleOverlapAggregator::initializeSampleProfileOverlap() {
+  const auto &BaseProf = BaseReader->getProfiles();
+  for (const auto &I : BaseProf) {
+    ++ProfOverlap.BaseCount;
+    FuncSampleStats FuncStats;
+    getFuncSampleStats(I.second, FuncStats, BaseHotThreshold);
+    ProfOverlap.BaseSample += FuncStats.SampleSum;
+    BaseStats.emplace(I.second.getContext(), FuncStats);
+  }
+
+  const auto &TestProf = TestReader->getProfiles();
+  for (const auto &I : TestProf) {
+    ++ProfOverlap.TestCount;
+    FuncSampleStats FuncStats;
+    getFuncSampleStats(I.second, FuncStats, TestHotThreshold);
+    ProfOverlap.TestSample += FuncStats.SampleSum;
+    TestStats.emplace(I.second.getContext(), FuncStats);
+  }
+
+  ProfOverlap.BaseName = StringRef(BaseFilename);
+  ProfOverlap.TestName = StringRef(TestFilename);
+}
+
+void SampleOverlapAggregator::dumpFuncSimilarity(raw_fd_ostream &OS) const {
+  using namespace sampleprof;
+
+  if (FuncSimilarityDump.empty())
+    return;
+
+  formatted_raw_ostream FOS(OS);
+  FOS << "Function-level details:\n";
+  FOS << "Base weight";
+  FOS.PadToColumn(TestWeightCol);
+  FOS << "Test weight";
+  FOS.PadToColumn(SimilarityCol);
+  FOS << "Similarity";
+  FOS.PadToColumn(OverlapCol);
+  FOS << "Overlap";
+  FOS.PadToColumn(BaseUniqueCol);
+  FOS << "Base unique";
+  FOS.PadToColumn(TestUniqueCol);
+  FOS << "Test unique";
+  FOS.PadToColumn(BaseSampleCol);
+  FOS << "Base samples";
+  FOS.PadToColumn(TestSampleCol);
+  FOS << "Test samples";
+  FOS.PadToColumn(FuncNameCol);
+  FOS << "Function name\n";
+  for (const auto &F : FuncSimilarityDump) {
+    double OverlapPercent =
+        F.second.UnionSample > 0
+            ? static_cast<double>(F.second.OverlapSample) / F.second.UnionSample
+            : 0;
+    double BaseUniquePercent =
+        F.second.BaseSample > 0
+            ? static_cast<double>(F.second.BaseUniqueSample) /
+                  F.second.BaseSample
+            : 0;
+    double TestUniquePercent =
+        F.second.TestSample > 0
+            ? static_cast<double>(F.second.TestUniqueSample) /
+                  F.second.TestSample
+            : 0;
+
+    FOS << format("%.2f%%", F.second.BaseWeight * 100);
+    FOS.PadToColumn(TestWeightCol);
+    FOS << format("%.2f%%", F.second.TestWeight * 100);
+    FOS.PadToColumn(SimilarityCol);
+    FOS << format("%.2f%%", F.second.Similarity * 100);
+    FOS.PadToColumn(OverlapCol);
+    FOS << format("%.2f%%", OverlapPercent * 100);
+    FOS.PadToColumn(BaseUniqueCol);
+    FOS << format("%.2f%%", BaseUniquePercent * 100);
+    FOS.PadToColumn(TestUniqueCol);
+    FOS << format("%.2f%%", TestUniquePercent * 100);
+    FOS.PadToColumn(BaseSampleCol);
+    FOS << F.second.BaseSample;
+    FOS.PadToColumn(TestSampleCol);
+    FOS << F.second.TestSample;
+    FOS.PadToColumn(FuncNameCol);
+    FOS << F.second.TestName.toString() << "\n";
+  }
+}
+
+void SampleOverlapAggregator::dumpProgramSummary(raw_fd_ostream &OS) const {
+  OS << "Profile overlap information for base_profile: "
+     << ProfOverlap.BaseName.toString()
+     << " and test_profile: " << ProfOverlap.TestName.toString()
+     << "\nProgram level:\n";
+
+  OS << "  Whole program profile similarity: "
+     << format("%.3f%%", ProfOverlap.Similarity * 100) << "\n";
+
+  assert(ProfOverlap.UnionSample > 0 &&
+         "Total samples in two profiles should be greater than 0");
+  double OverlapPercent =
+      static_cast<double>(ProfOverlap.OverlapSample) / ProfOverlap.UnionSample;
+  assert(ProfOverlap.BaseSample > 0 &&
+         "Total samples in base profile should be greater than 0");
+  double BaseUniquePercent = static_cast<double>(ProfOverlap.BaseUniqueSample) /
+                             ProfOverlap.BaseSample;
+  assert(ProfOverlap.TestSample > 0 &&
+         "Total samples in test profile should be greater than 0");
+  double TestUniquePercent = static_cast<double>(ProfOverlap.TestUniqueSample) /
+                             ProfOverlap.TestSample;
+
+  OS << "  Whole program sample overlap: "
+     << format("%.3f%%", OverlapPercent * 100) << "\n";
+  OS << "    percentage of samples unique in base profile: "
+     << format("%.3f%%", BaseUniquePercent * 100) << "\n";
+  OS << "    percentage of samples unique in test profile: "
+     << format("%.3f%%", TestUniquePercent * 100) << "\n";
+  OS << "    total samples in base profile: " << ProfOverlap.BaseSample << "\n"
+     << "    total samples in test profile: " << ProfOverlap.TestSample << "\n";
+
+  assert(ProfOverlap.UnionCount > 0 &&
+         "There should be at least one function in two input profiles");
+  double FuncOverlapPercent =
+      static_cast<double>(ProfOverlap.OverlapCount) / ProfOverlap.UnionCount;
+  OS << "  Function overlap: " << format("%.3f%%", FuncOverlapPercent * 100)
+     << "\n";
+  OS << "    overlap functions: " << ProfOverlap.OverlapCount << "\n";
+  OS << "    functions unique in base profile: " << ProfOverlap.BaseUniqueCount
+     << "\n";
+  OS << "    functions unique in test profile: " << ProfOverlap.TestUniqueCount
+     << "\n";
+}
+
+void SampleOverlapAggregator::dumpHotFuncAndBlockOverlap(
+    raw_fd_ostream &OS) const {
+  assert(HotFuncOverlap.UnionCount > 0 &&
+         "There should be at least one hot function in two input profiles");
+  OS << "  Hot-function overlap: "
+     << format("%.3f%%", static_cast<double>(HotFuncOverlap.OverlapCount) /
+                             HotFuncOverlap.UnionCount * 100)
+     << "\n";
+  OS << "    overlap hot functions: " << HotFuncOverlap.OverlapCount << "\n";
+  OS << "    hot functions unique in base profile: "
+     << HotFuncOverlap.BaseCount - HotFuncOverlap.OverlapCount << "\n";
+  OS << "    hot functions unique in test profile: "
+     << HotFuncOverlap.TestCount - HotFuncOverlap.OverlapCount << "\n";
+
+  assert(HotBlockOverlap.UnionCount > 0 &&
+         "There should be at least one hot block in two input profiles");
+  OS << "  Hot-block overlap: "
+     << format("%.3f%%", static_cast<double>(HotBlockOverlap.OverlapCount) /
+                             HotBlockOverlap.UnionCount * 100)
+     << "\n";
+  OS << "    overlap hot blocks: " << HotBlockOverlap.OverlapCount << "\n";
+  OS << "    hot blocks unique in base profile: "
+     << HotBlockOverlap.BaseCount - HotBlockOverlap.OverlapCount << "\n";
+  OS << "    hot blocks unique in test profile: "
+     << HotBlockOverlap.TestCount - HotBlockOverlap.OverlapCount << "\n";
+}
+
+std::error_code SampleOverlapAggregator::loadProfiles() {
+  using namespace sampleprof;
+
+  LLVMContext Context;
+  auto FS = vfs::getRealFileSystem();
+  auto BaseReaderOrErr = SampleProfileReader::create(BaseFilename, Context, *FS,
+                                                     FSDiscriminatorPassOption);
+  if (std::error_code EC = BaseReaderOrErr.getError())
+    exitWithErrorCode(EC, BaseFilename);
+
+  auto TestReaderOrErr = SampleProfileReader::create(TestFilename, Context, *FS,
+                                                     FSDiscriminatorPassOption);
+  if (std::error_code EC = TestReaderOrErr.getError())
+    exitWithErrorCode(EC, TestFilename);
+
+  BaseReader = std::move(BaseReaderOrErr.get());
+  TestReader = std::move(TestReaderOrErr.get());
+
+  if (std::error_code EC = BaseReader->read())
+    exitWithErrorCode(EC, BaseFilename);
+  if (std::error_code EC = TestReader->read())
+    exitWithErrorCode(EC, TestFilename);
+  if (BaseReader->profileIsProbeBased() != TestReader->profileIsProbeBased())
+    exitWithError(
+        "cannot compare probe-based profile with non-probe-based profile");
+  if (BaseReader->profileIsCS() != TestReader->profileIsCS())
+    exitWithError("cannot compare CS profile with non-CS profile");
+
+  // Load BaseHotThreshold and TestHotThreshold as the 99th-percentile
+  // threshold in the profile summary.
+  ProfileSummary &BasePS = BaseReader->getSummary();
+  ProfileSummary &TestPS = TestReader->getSummary();
+  BaseHotThreshold =
+      ProfileSummaryBuilder::getHotCountThreshold(BasePS.getDetailedSummary());
+  TestHotThreshold =
+      ProfileSummaryBuilder::getHotCountThreshold(TestPS.getDetailedSummary());
+
+  return std::error_code();
+}
+
+void overlapSampleProfile(const std::string &BaseFilename,
+                          const std::string &TestFilename,
+                          const OverlapFuncFilters &FuncFilter,
+                          uint64_t SimilarityCutoff, raw_fd_ostream &OS) {
+  using namespace sampleprof;
+
+  // We use 0.000005 to initialize OverlapAggr.Epsilon because the final
+  // metrics report 2-3 places after the decimal point in percentage numbers.
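+  // E.g., SimilarityCutoff=990000 yields a LowSimilarityThreshold of 0.99
+  // (99.0000%), since the cutoff is divided by 1000000 below.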
+  SampleOverlapAggregator OverlapAggr(
+      BaseFilename, TestFilename,
+      static_cast<double>(SimilarityCutoff) / 1000000, 0.000005, FuncFilter);
+  if (std::error_code EC = OverlapAggr.loadProfiles())
+    exitWithErrorCode(EC);
+
+  OverlapAggr.initializeSampleProfileOverlap();
+  if (OverlapAggr.detectZeroSampleProfile(OS))
+    return;
+
+  OverlapAggr.computeSampleProfileOverlap(OS);
+
+  OverlapAggr.dumpProgramSummary(OS);
+  OverlapAggr.dumpHotFuncAndBlockOverlap(OS);
+  OverlapAggr.dumpFuncSimilarity(OS);
+}
+
+static int overlap_main() {
+  std::error_code EC;
+  raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF);
+  if (EC)
+    exitWithErrorCode(EC, OutputFilename);
+
+  if (ProfileKind == instr)
+    overlapInstrProfile(BaseFilename, TestFilename,
+                        OverlapFuncFilters{OverlapValueCutoff, FuncNameFilter},
+                        OS, IsCS);
+  else
+    overlapSampleProfile(BaseFilename, TestFilename,
+                         OverlapFuncFilters{OverlapValueCutoff, FuncNameFilter},
+                         SimilarityCutoff, OS);
+
+  return 0;
+}
+
+namespace {
+struct ValueSitesStats {
+  ValueSitesStats() = default;
+  uint64_t TotalNumValueSites = 0;
+  uint64_t TotalNumValueSitesWithValueProfile = 0;
+  uint64_t TotalNumValues = 0;
+  std::vector<unsigned> ValueSitesHistogram;
+};
+} // namespace
+
+static void traverseAllValueSites(const InstrProfRecord &Func, uint32_t VK,
+                                  ValueSitesStats &Stats, raw_fd_ostream &OS,
+                                  InstrProfSymtab *Symtab) {
+  uint32_t NS = Func.getNumValueSites(VK);
+  Stats.TotalNumValueSites += NS;
+  for (size_t I = 0; I < NS; ++I) {
+    auto VD = Func.getValueArrayForSite(VK, I);
+    uint32_t NV = VD.size();
+    if (NV == 0)
+      continue;
+    Stats.TotalNumValues += NV;
+    Stats.TotalNumValueSitesWithValueProfile++;
+    if (NV > Stats.ValueSitesHistogram.size())
+      Stats.ValueSitesHistogram.resize(NV, 0);
+    Stats.ValueSitesHistogram[NV - 1]++;
+
+    uint64_t SiteSum = 0;
+    for (const auto &V : VD)
+      SiteSum += V.Count;
+    if (SiteSum == 0)
+      SiteSum = 1;
+
+    for (const auto &V : VD) {
+      OS << "\t[ " << format("%2u", I) << ", ";
+      if (Symtab == nullptr)
+        OS << format("%4" PRIu64, V.Value);
+      else
+        OS << Symtab->getFuncOrVarName(V.Value);
+      OS << ", " << format("%10" PRId64, V.Count) << " ] ("
+         << format("%.2f%%", (V.Count * 100.0 / SiteSum)) << ")\n";
+    }
+  }
+}
+
+static void showValueSitesStats(raw_fd_ostream &OS, uint32_t VK,
+                                ValueSitesStats &Stats) {
+  OS << "  Total number of sites: " << Stats.TotalNumValueSites << "\n";
+  OS << "  Total number of sites with values: "
+     << Stats.TotalNumValueSitesWithValueProfile << "\n";
+  OS << "  Total number of profiled values: " << Stats.TotalNumValues << "\n";
+
+  OS << "  Value sites histogram:\n\tNumTargets, SiteCount\n";
+  for (unsigned I = 0; I < Stats.ValueSitesHistogram.size(); I++) {
+    if (Stats.ValueSitesHistogram[I] > 0)
+      OS << "\t" << I + 1 << ", " << Stats.ValueSitesHistogram[I] << "\n";
+  }
+}
+
+static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) {
+  if (SFormat == ShowFormat::Json)
+    exitWithError("JSON output is not supported for instr profiles");
+  if (SFormat == ShowFormat::Yaml)
+    exitWithError("YAML output is not supported for instr profiles");
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = InstrProfReader::create(Filename, *FS);
+  std::vector<uint32_t> Cutoffs = std::move(DetailedSummaryCutoffs);
+  if (ShowDetailedSummary && Cutoffs.empty()) {
+    Cutoffs = ProfileSummaryBuilder::DefaultCutoffs;
+  }
+  InstrProfSummaryBuilder Builder(std::move(Cutoffs));
+  if (Error E = ReaderOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  auto Reader = std::move(ReaderOrErr.get());
+  bool IsIRInstr = Reader->isIRLevelProfile();
+  size_t ShownFunctions = 0;
+  size_t BelowCutoffFunctions = 0;
+  int NumVPKind = IPVK_Last - IPVK_First + 1;
+  std::vector<ValueSitesStats> VPStats(NumVPKind);
+
+  auto MinCmp = [](const std::pair<std::string, uint64_t> &v1,
+                   const std::pair<std::string, uint64_t> &v2) {
+    return v1.second > v2.second;
+  };
+
+  std::priority_queue<std::pair<std::string, uint64_t>,
+                      std::vector<std::pair<std::string, uint64_t>>,
+                      decltype(MinCmp)>
+      HottestFuncs(MinCmp);
+
+  if (!TextFormat && OnlyListBelow) {
+    OS << "The list of functions with the maximum counter less than "
+       << ShowValueCutoff << ":\n";
+  }
+
+  // Add marker so that IR-level instrumentation round-trips properly.
+  if (TextFormat && IsIRInstr)
+    OS << ":ir\n";
+
+  for (const auto &Func : *Reader) {
+    if (Reader->isIRLevelProfile()) {
+      bool FuncIsCS = NamedInstrProfRecord::hasCSFlagInHash(Func.Hash);
+      if (FuncIsCS != ShowCS)
+        continue;
+    }
+    bool Show = ShowAllFunctions ||
+                (!FuncNameFilter.empty() && Func.Name.contains(FuncNameFilter));
+
+    bool doTextFormatDump = (Show && TextFormat);
+
+    if (doTextFormatDump) {
+      InstrProfSymtab &Symtab = Reader->getSymtab();
+      InstrProfWriter::writeRecordInText(Func.Name, Func.Hash, Func, Symtab,
+                                         OS);
+      continue;
+    }
+
+    assert(Func.Counts.size() > 0 && "function missing entry counter");
+    Builder.addRecord(Func);
+
+    if (ShowCovered) {
+      if (llvm::any_of(Func.Counts, [](uint64_t C) { return C; }))
+        OS << Func.Name << "\n";
+      continue;
+    }
+
+    uint64_t FuncMax = 0;
+    uint64_t FuncSum = 0;
+
+    auto PseudoKind = Func.getCountPseudoKind();
+    if (PseudoKind != InstrProfRecord::NotPseudo) {
+      if (Show) {
+        if (!ShownFunctions)
+          OS << "Counters:\n";
+        ++ShownFunctions;
+        OS << "  " << Func.Name << ":\n"
+           << "    Hash: " << format("0x%016" PRIx64, Func.Hash) << "\n"
+           << "    Counters: " << Func.Counts.size();
+        if (PseudoKind == InstrProfRecord::PseudoHot)
+          OS << " <PseudoHot>\n";
+        else if (PseudoKind == InstrProfRecord::PseudoWarm)
+          OS << " <PseudoWarm>\n";
+        else
+          llvm_unreachable("Unknown PseudoKind");
+      }
+      continue;
+    }
+
+    for (size_t I = 0, E = Func.Counts.size(); I < E; ++I) {
+      FuncMax = std::max(FuncMax, Func.Counts[I]);
+      FuncSum += Func.Counts[I];
+    }
+
+    if (FuncMax < ShowValueCutoff) {
+      ++BelowCutoffFunctions;
+      if (OnlyListBelow) {
+        OS << "  " << Func.Name << ": (Max = " << FuncMax
+           << " Sum = " << FuncSum << ")\n";
+      }
+      continue;
+    } else if (OnlyListBelow)
+      continue;
+
+    if (TopNFunctions) {
+      if (HottestFuncs.size() == TopNFunctions) {
+        if (HottestFuncs.top().second < FuncMax) {
+          HottestFuncs.pop();
+          HottestFuncs.emplace(std::make_pair(std::string(Func.Name), FuncMax));
+        }
+      } else
+        HottestFuncs.emplace(std::make_pair(std::string(Func.Name), FuncMax));
+    }
+
+    if (Show) {
+      if (!ShownFunctions)
+        OS << "Counters:\n";
+
+      ++ShownFunctions;
+
+      OS << "  " << Func.Name << ":\n"
+         << "    Hash: " << format("0x%016" PRIx64, Func.Hash) << "\n"
+         << "    Counters: " << Func.Counts.size() << "\n";
+      if (!IsIRInstr)
+        OS << "    Function count: " << Func.Counts[0] << "\n";
+
+      if (ShowIndirectCallTargets)
+        OS << "    Indirect Call Site Count: "
+           << Func.getNumValueSites(IPVK_IndirectCallTarget) << "\n";
+
+      if (ShowVTables)
+        OS << "    Number of instrumented vtables: "
+           << Func.getNumValueSites(IPVK_VTableTarget) << "\n";
+
+      uint32_t NumMemOPCalls = Func.getNumValueSites(IPVK_MemOPSize);
+      if (ShowMemOPSizes && NumMemOPCalls > 0)
+        OS << "    Number of Memory Intrinsics Calls: " << NumMemOPCalls
+           << "\n";
+
+      if (ShowCounts) {
+        OS << "    Block counts: [";
+        size_t Start = (IsIRInstr ? 0 : 1);
+        for (size_t I = Start, E = Func.Counts.size(); I < E; ++I) {
+          OS << (I == Start ? "" : ", ") << Func.Counts[I];
+        }
+        OS << "]\n";
+      }
+
+      if (ShowIndirectCallTargets) {
+        OS << "    Indirect Target Results:\n";
+        traverseAllValueSites(Func, IPVK_IndirectCallTarget,
+                              VPStats[IPVK_IndirectCallTarget], OS,
+                              &(Reader->getSymtab()));
+      }
+
+      if (ShowVTables) {
+        OS << "    VTable Results:\n";
+        traverseAllValueSites(Func, IPVK_VTableTarget,
+                              VPStats[IPVK_VTableTarget], OS,
+                              &(Reader->getSymtab()));
+      }
+
+      if (ShowMemOPSizes && NumMemOPCalls > 0) {
+        OS << "    Memory Intrinsic Size Results:\n";
+        traverseAllValueSites(Func, IPVK_MemOPSize, VPStats[IPVK_MemOPSize], OS,
+                              nullptr);
+      }
+    }
+  }
+  if (Reader->hasError())
+    exitWithError(Reader->getError(), Filename);
+
+  if (TextFormat || ShowCovered)
+    return 0;
+  std::unique_ptr<ProfileSummary> PS(Builder.getSummary());
+  bool IsIR = Reader->isIRLevelProfile();
+  OS << "Instrumentation level: " << (IsIR ? "IR" : "Front-end");
+  if (IsIR) {
+    OS << "  entry_first = " << Reader->instrEntryBBEnabled();
+    OS << "  instrument_loop_entries = " << Reader->instrLoopEntriesEnabled();
+  }
+  OS << "\n";
+  if (ShowAllFunctions || !FuncNameFilter.empty())
+    OS << "Functions shown: " << ShownFunctions << "\n";
+  PS->printSummary(OS);
+  if (ShowValueCutoff > 0) {
+    OS << "Number of functions with maximum count (< " << ShowValueCutoff
+       << "): " << BelowCutoffFunctions << "\n";
+    OS << "Number of functions with maximum count (>= " << ShowValueCutoff
+       << "): " << PS->getNumFunctions() - BelowCutoffFunctions << "\n";
+  }
+
+  if (TopNFunctions) {
+    std::vector<std::pair<std::string, uint64_t>> SortedHottestFuncs;
+    while (!HottestFuncs.empty()) {
+      SortedHottestFuncs.emplace_back(HottestFuncs.top());
+      HottestFuncs.pop();
+    }
+    OS << "Top " << TopNFunctions
+       << " functions with the largest internal block counts: \n";
+    for (auto &hotfunc : llvm::reverse(SortedHottestFuncs))
+      OS << "  " << hotfunc.first << ", max count = " << hotfunc.second << "\n";
+  }
+
+  if (ShownFunctions && ShowIndirectCallTargets) {
+    OS << "Statistics for indirect call sites profile:\n";
+    showValueSitesStats(OS, IPVK_IndirectCallTarget,
+                        VPStats[IPVK_IndirectCallTarget]);
+  }
+
+  if (ShownFunctions && ShowVTables) {
+    OS << "Statistics for vtable profile:\n";
+    showValueSitesStats(OS, IPVK_VTableTarget, VPStats[IPVK_VTableTarget]);
+  }
+
+  if (ShownFunctions && ShowMemOPSizes) {
+    OS << "Statistics for memory intrinsic calls sizes profile:\n";
+    showValueSitesStats(OS, IPVK_MemOPSize, VPStats[IPVK_MemOPSize]);
+  }
+
+  if (ShowDetailedSummary)
+    PS->printDetailedSummary(OS);
+
+  if (ShowBinaryIds)
+    if (Error E = Reader->printBinaryIds(OS))
+      exitWithError(std::move(E), Filename);
+
+  if (ShowProfileVersion)
+    OS << "Profile version: " << Reader->getVersion() << "\n";
+
+  if (ShowTemporalProfTraces) {
+    auto &Traces = Reader->getTemporalProfTraces();
+    OS << "Temporal Profile Traces (samples=" << Traces.size()
+       << " seen=" << Reader->getTemporalProfTraceStreamSize() << "):\n";
+    for (unsigned i = 0; i < Traces.size(); i++) {
+      OS << "  Temporal Profile Trace " << i << " (weight=" << Traces[i].Weight
+         << " count=" << Traces[i].FunctionNameRefs.size() << "):\n";
+      for (auto &NameRef : Traces[i].FunctionNameRefs)
+        OS << "    " << Reader->getSymtab().getFuncOrVarName(NameRef) << "\n";
+    }
+  }
+
+  return 0;
+}
+
+static void showSectionInfo(sampleprof::SampleProfileReader *Reader,
+                            raw_fd_ostream &OS) {
+  if (!Reader->dumpSectionInfo(OS)) {
+    WithColor::warning() << "-show-sec-info-only is only supported for "
+                         << "sample profile in extbinary format and is "
+                         << "ignored for other formats.\n";
+    return;
+  }
+}
+
+namespace {
+struct HotFuncInfo { + std::string FuncName; + uint64_t TotalCount = 0; + double TotalCountPercent = 0.0f; + uint64_t MaxCount = 0; + uint64_t EntryCount = 0; + + HotFuncInfo() = default; + + HotFuncInfo(StringRef FN, uint64_t TS, double TSP, uint64_t MS, uint64_t ES) + : FuncName(FN.begin(), FN.end()), TotalCount(TS), TotalCountPercent(TSP), + MaxCount(MS), EntryCount(ES) {} +}; +} // namespace + +// Print out detailed information about hot functions in PrintValues vector. +// Users specify titles and offset of every columns through ColumnTitle and +// ColumnOffset. The size of ColumnTitle and ColumnOffset need to be the same +// and at least 4. Besides, users can optionally give a HotFuncMetric string to +// print out or let it be an empty string. +static void dumpHotFunctionList(const std::vector &ColumnTitle, + const std::vector &ColumnOffset, + const std::vector &PrintValues, + uint64_t HotFuncCount, uint64_t TotalFuncCount, + uint64_t HotProfCount, uint64_t TotalProfCount, + const std::string &HotFuncMetric, + uint32_t TopNFunctions, raw_fd_ostream &OS) { + assert(ColumnOffset.size() == ColumnTitle.size() && + "ColumnOffset and ColumnTitle should have the same size"); + assert(ColumnTitle.size() >= 4 && + "ColumnTitle should have at least 4 elements"); + assert(TotalFuncCount > 0 && + "There should be at least one function in the profile"); + double TotalProfPercent = 0; + if (TotalProfCount > 0) + TotalProfPercent = static_cast(HotProfCount) / TotalProfCount * 100; + + formatted_raw_ostream FOS(OS); + FOS << HotFuncCount << " out of " << TotalFuncCount + << " functions with profile (" + << format("%.2f%%", + (static_cast(HotFuncCount) / TotalFuncCount * 100)) + << ") are considered hot functions"; + if (!HotFuncMetric.empty()) + FOS << " (" << HotFuncMetric << ")"; + FOS << ".\n"; + FOS << HotProfCount << " out of " << TotalProfCount << " profile counts (" + << format("%.2f%%", TotalProfPercent) << ") are from hot functions.\n"; + + for (size_t I = 0; I < ColumnTitle.size(); ++I) { + FOS.PadToColumn(ColumnOffset[I]); + FOS << ColumnTitle[I]; + } + FOS << "\n"; + + uint32_t Count = 0; + for (const auto &R : PrintValues) { + if (TopNFunctions && (Count++ == TopNFunctions)) + break; + FOS.PadToColumn(ColumnOffset[0]); + FOS << R.TotalCount << " (" << format("%.2f%%", R.TotalCountPercent) << ")"; + FOS.PadToColumn(ColumnOffset[1]); + FOS << R.MaxCount; + FOS.PadToColumn(ColumnOffset[2]); + FOS << R.EntryCount; + FOS.PadToColumn(ColumnOffset[3]); + FOS << R.FuncName << "\n"; + } +} + +static int showHotFunctionList(const sampleprof::SampleProfileMap &Profiles, + ProfileSummary &PS, uint32_t TopN, + raw_fd_ostream &OS) { + using namespace sampleprof; + + const uint32_t HotFuncCutoff = 990000; + auto &SummaryVector = PS.getDetailedSummary(); + uint64_t MinCountThreshold = 0; + for (const ProfileSummaryEntry &SummaryEntry : SummaryVector) { + if (SummaryEntry.Cutoff == HotFuncCutoff) { + MinCountThreshold = SummaryEntry.MinCount; + break; + } + } + + // Traverse all functions in the profile and keep only hot functions. + // The following loop also calculates the sum of total samples of all + // functions. 
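+  // For instance (illustrative numbers): if the 99% cutoff entry of the
+  // summary reports MinCount = 100, every function whose maximum sample
+  // count reaches 100 is kept as hot, matching the "max sample >= 100"
+  // metric string printed alongside the list.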
+ std::multimap, + std::greater> + HotFunc; + uint64_t ProfileTotalSample = 0; + uint64_t HotFuncSample = 0; + uint64_t HotFuncCount = 0; + + for (const auto &I : Profiles) { + FuncSampleStats FuncStats; + const FunctionSamples &FuncProf = I.second; + ProfileTotalSample += FuncProf.getTotalSamples(); + getFuncSampleStats(FuncProf, FuncStats, MinCountThreshold); + + if (isFunctionHot(FuncStats, MinCountThreshold)) { + HotFunc.emplace(FuncProf.getTotalSamples(), + std::make_pair(&(I.second), FuncStats.MaxSample)); + HotFuncSample += FuncProf.getTotalSamples(); + ++HotFuncCount; + } + } + + std::vector ColumnTitle{"Total sample (%)", "Max sample", + "Entry sample", "Function name"}; + std::vector ColumnOffset{0, 24, 42, 58}; + std::string Metric = + std::string("max sample >= ") + std::to_string(MinCountThreshold); + std::vector PrintValues; + for (const auto &FuncPair : HotFunc) { + const FunctionSamples &Func = *FuncPair.second.first; + double TotalSamplePercent = + (ProfileTotalSample > 0) + ? (Func.getTotalSamples() * 100.0) / ProfileTotalSample + : 0; + PrintValues.emplace_back( + HotFuncInfo(Func.getContext().toString(), Func.getTotalSamples(), + TotalSamplePercent, FuncPair.second.second, + Func.getHeadSamplesEstimate())); + } + dumpHotFunctionList(ColumnTitle, ColumnOffset, PrintValues, HotFuncCount, + Profiles.size(), HotFuncSample, ProfileTotalSample, + Metric, TopN, OS); + + return 0; +} + +static int showSampleProfile(ShowFormat SFormat, raw_fd_ostream &OS) { + if (SFormat == ShowFormat::Yaml) + exitWithError("YAML output is not supported for sample profiles"); + using namespace sampleprof; + LLVMContext Context; + auto FS = vfs::getRealFileSystem(); + auto ReaderOrErr = SampleProfileReader::create(Filename, Context, *FS, + FSDiscriminatorPassOption); + if (std::error_code EC = ReaderOrErr.getError()) + exitWithErrorCode(EC, Filename); + + auto Reader = std::move(ReaderOrErr.get()); + if (ShowSectionInfoOnly) { + showSectionInfo(Reader.get(), OS); + return 0; + } + + if (std::error_code EC = Reader->read()) + exitWithErrorCode(EC, Filename); + + if (ShowAllFunctions || FuncNameFilter.empty()) { + if (SFormat == ShowFormat::Json) + Reader->dumpJson(OS); + else + Reader->dump(OS); + } else { + if (SFormat == ShowFormat::Json) + exitWithError( + "the JSON format is supported only when all functions are to " + "be printed"); + + // TODO: parse context string to support filtering by contexts. + FunctionSamples *FS = Reader->getSamplesFor(StringRef(FuncNameFilter)); + Reader->dumpFunctionProfile(FS ? *FS : FunctionSamples(), OS); + } + + if (ShowProfileSymbolList) { + std::unique_ptr ReaderList = + Reader->getProfileSymbolList(); + ReaderList->dump(OS); + } + + if (ShowDetailedSummary) { + auto &PS = Reader->getSummary(); + PS.printSummary(OS); + PS.printDetailedSummary(OS); + } + + if (ShowHotFuncList || TopNFunctions) + showHotFunctionList(Reader->getProfiles(), Reader->getSummary(), + TopNFunctions, OS); + + return 0; +} + +static int showMemProfProfile(ShowFormat SFormat, raw_fd_ostream &OS) { + if (SFormat == ShowFormat::Json) + exitWithError("JSON output is not supported for MemProf"); + + // Show the raw profile in YAML. + if (memprof::RawMemProfReader::hasFormat(Filename)) { + auto ReaderOr = llvm::memprof::RawMemProfReader::create( + Filename, ProfiledBinary, /*KeepNames=*/true); + if (Error E = ReaderOr.takeError()) { + // Since the error can be related to the profile or the binary we do not + // pass whence. 
Instead, additional context is provided where necessary in
+      // the error message.
+      exitWithError(std::move(E), /*Whence*/ "");
+    }
+
+    std::unique_ptr<memprof::RawMemProfReader> Reader(
+        ReaderOr.get().release());
+
+    Reader->printYAML(OS);
+    return 0;
+  }
+
+  // Show the indexed MemProf profile in YAML.
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = IndexedInstrProfReader::create(Filename, *FS);
+  if (Error E = ReaderOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  auto Reader = std::move(ReaderOrErr.get());
+  memprof::AllMemProfData Data = Reader->getAllMemProfData();
+
+  // For v4 and above the summary is serialized in the indexed profile, and can
+  // be accessed from the reader. Earlier versions build the summary below.
+  // The summary is emitted as YAML comments at the start of the output.
+  if (auto *MemProfSum = Reader->getMemProfSummary()) {
+    MemProfSum->printSummaryYaml(OS);
+  } else {
+    memprof::MemProfSummaryBuilder MemProfSumBuilder;
+    for (auto &Pair : Data.HeapProfileRecords)
+      MemProfSumBuilder.addRecord(Pair.Record);
+    MemProfSumBuilder.getSummary()->printSummaryYaml(OS);
+  }
+  // Construct yaml::Output with a maximum column width of 80 so that each
+  // Frame fits in one line.
+  yaml::Output Yout(OS, nullptr, 80);
+  Yout << Data;
+
+  return 0;
+}
+
+static int showDebugInfoCorrelation(const std::string &Filename,
+                                    ShowFormat SFormat, raw_fd_ostream &OS) {
+  if (SFormat == ShowFormat::Json)
+    exitWithError("JSON output is not supported for debug info correlation");
+  std::unique_ptr<InstrProfCorrelator> Correlator;
+  if (auto Err =
+          InstrProfCorrelator::get(Filename, InstrProfCorrelator::DEBUG_INFO)
+              .moveInto(Correlator))
+    exitWithError(std::move(Err), Filename);
+  if (SFormat == ShowFormat::Yaml) {
+    if (auto Err = Correlator->dumpYaml(MaxDbgCorrelationWarnings, OS))
+      exitWithError(std::move(Err), Filename);
+    return 0;
+  }
+
+  if (auto Err = Correlator->correlateProfileData(MaxDbgCorrelationWarnings))
+    exitWithError(std::move(Err), Filename);
+
+  InstrProfSymtab Symtab;
+  if (auto Err = Symtab.create(
+          StringRef(Correlator->getNamesPointer(), Correlator->getNamesSize())))
+    exitWithError(std::move(Err), Filename);
+
+  if (ShowProfileSymbolList)
+    Symtab.dumpNames(OS);
+  // TODO: Read "Profile Data Type" from debug info to compute and show how
+  // many counters the section holds.
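+  // Illustrative output of the block below, with hypothetical values (the
+  // first line only appears when ShowDetailedSummary is set):
+  //   Counters section size: 0x1f40 bytes
+  //   Found 250 functions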
+ if (ShowDetailedSummary) + OS << "Counters section size: 0x" + << Twine::utohexstr(Correlator->getCountersSectionSize()) << " bytes\n"; + OS << "Found " << Correlator->getDataSize() << " functions\n"; + + return 0; +} + +static int show_main(StringRef ProgName) { + if (Filename.empty() && DebugInfoFilename.empty()) + exitWithError( + "the positional argument '' is required unless '--" + + DebugInfoFilename.ArgStr + "' is provided"); + + if (Filename == OutputFilename) { + errs() << ProgName + << " show: Input file name cannot be the same as the output file " + "name!\n"; + return 1; + } + if (JsonFormat) + SFormat = ShowFormat::Json; + + std::error_code EC; + raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF); + if (EC) + exitWithErrorCode(EC, OutputFilename); + + if (ShowAllFunctions && !FuncNameFilter.empty()) + WithColor::warning() << "-function argument ignored: showing all functions\n"; + + if (!DebugInfoFilename.empty()) + return showDebugInfoCorrelation(DebugInfoFilename, SFormat, OS); + + if (ShowProfileKind == instr) + return showInstrProfile(SFormat, OS); + if (ShowProfileKind == sample) + return showSampleProfile(SFormat, OS); + return showMemProfProfile(SFormat, OS); +} + +static int order_main() { + std::error_code EC; + raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF); + if (EC) + exitWithErrorCode(EC, OutputFilename); + auto FS = vfs::getRealFileSystem(); + auto ReaderOrErr = InstrProfReader::create(Filename, *FS); + if (Error E = ReaderOrErr.takeError()) + exitWithError(std::move(E), Filename); + + auto Reader = std::move(ReaderOrErr.get()); + for (auto &I : *Reader) { + // Read all entries + (void)I; + } + ArrayRef Traces = Reader->getTemporalProfTraces(); + if (NumTestTraces && NumTestTraces >= Traces.size()) + exitWithError( + "--" + NumTestTraces.ArgStr + + " must be smaller than the total number of traces: expected: < " + + Twine(Traces.size()) + ", actual: " + Twine(NumTestTraces)); + ArrayRef TestTraces = Traces.take_back(NumTestTraces); + Traces = Traces.drop_back(NumTestTraces); + + std::vector Nodes; + TemporalProfTraceTy::createBPFunctionNodes(Traces, Nodes); + BalancedPartitioningConfig Config; + BalancedPartitioning BP(Config); + BP.run(Nodes); + + OS << "# Ordered " << Nodes.size() << " functions\n"; + if (!TestTraces.empty()) { + // Since we don't know the symbol sizes, we assume 32 functions per page. + DenseMap IdToPageNumber; + for (auto &Node : Nodes) + IdToPageNumber[Node.Id] = IdToPageNumber.size() / 32; + + SmallSet TouchedPages; + unsigned Area = 0; + for (auto &Trace : TestTraces) { + for (auto Id : Trace.FunctionNameRefs) { + auto It = IdToPageNumber.find(Id); + if (It == IdToPageNumber.end()) + continue; + TouchedPages.insert(It->getSecond()); + Area += TouchedPages.size(); + } + TouchedPages.clear(); + } + OS << "# Total area under the page fault curve: " << (float)Area << "\n"; + } + OS << "# Warning: Mach-O may prefix symbols with \"_\" depending on the " + "linkage and this output does not take that into account. 
Some " + "post-processing may be required before passing to the linker via " + "-order_file.\n"; + for (auto &N : Nodes) { + auto [Filename, ParsedFuncName] = + getParsedIRPGOName(Reader->getSymtab().getFuncOrVarName(N.Id)); + if (!Filename.empty()) + OS << "# " << Filename << "\n"; + OS << ParsedFuncName << "\n"; + } + return 0; +} + +int llvm_profdata_main(int argc, char **argvNonConst, + const llvm::ToolContext &) { + const char **argv = const_cast(argvNonConst); + + StringRef ProgName(sys::path::filename(argv[0])); + + if (argc < 2) { + errs() + << ProgName + << ": No subcommand specified! Run llvm-profdata --help for usage.\n"; + return 1; + } + + cl::ParseCommandLineOptions(argc, argv, "LLVM profile data\n"); + + if (ShowSubcommand) + return show_main(ProgName); + + if (OrderSubcommand) + return order_main(); + + if (OverlapSubcommand) + return overlap_main(); + + if (MergeSubcommand) + return merge_main(ProgName); + + errs() << ProgName + << ": Unknown command. Run llvm-profdata --help for usage.\n"; + return 1; +} + +// LDC manually added `main` function, which is generated by CMake in LLVM's build. See LLVM's llvm-driver-template.cpp.in +#include "llvm/Support/InitLLVM.h" +int main(int argc, char **argv) { + llvm::InitLLVM X(argc, argv); + return llvm_profdata_main(argc, argv, {argv[0], nullptr, false}); +} diff --git a/tools/ldc-profgen/ldc-profgen-21.1/CMakeLists.txt b/tools/ldc-profgen/ldc-profgen-21.1/CMakeLists.txt new file mode 100644 index 0000000000..354c63f409 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-21.1/CMakeLists.txt @@ -0,0 +1,25 @@ + +set(LLVM_LINK_COMPONENTS + AllTargetsDescs + AllTargetsDisassemblers + AllTargetsInfos + DebugInfoDWARF + Core + MC + IPO + MCDisassembler + Object + ProfileData + Support + Symbolize + TargetParser + ) + +add_llvm_tool(llvm-profgen + llvm-profgen.cpp + PerfReader.cpp + CSPreInliner.cpp + ProfiledBinary.cpp + ProfileGenerator.cpp + MissingFrameInferrer.cpp + ) diff --git a/tools/ldc-profgen/ldc-profgen-21.1/CSPreInliner.cpp b/tools/ldc-profgen/ldc-profgen-21.1/CSPreInliner.cpp new file mode 100644 index 0000000000..87df6996aa --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-21.1/CSPreInliner.cpp @@ -0,0 +1,316 @@ +//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSPreInliner.h"
+#include "ProfiledBinary.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/Transforms/IPO/SampleProfile.h"
+#include <cstdint>
+#include <queue>
+
+#define DEBUG_TYPE "cs-preinliner"
+
+using namespace llvm;
+using namespace sampleprof;
+
+STATISTIC(PreInlNumCSInlined,
+          "Number of functions inlined with context sensitive profile");
+STATISTIC(PreInlNumCSNotInlined,
+          "Number of functions not inlined with context sensitive profile");
+STATISTIC(PreInlNumCSInlinedHitMinLimit,
+          "Number of functions with FDO inline stopped due to min size limit");
+STATISTIC(PreInlNumCSInlinedHitMaxLimit,
+          "Number of functions with FDO inline stopped due to max size limit");
+STATISTIC(
+    PreInlNumCSInlinedHitGrowthLimit,
+    "Number of functions with FDO inline stopped due to growth size limit");
+
+// The switches specify inline thresholds used in SampleProfileLoader inlining.
+// TODO: the actual thresholds still need to be tuned here, because the size
+// used here is based on machine code, not LLVM IR.
+namespace llvm {
+cl::opt<bool> EnableCSPreInliner(
+    "csspgo-preinliner", cl::Hidden, cl::init(true),
+    cl::desc("Run a global pre-inliner to merge context profile based on "
+             "estimated global top-down inline decisions"));
+
+cl::opt<bool> UseContextCostForPreInliner(
+    "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
+    cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
+} // namespace llvm
+
+static cl::opt<bool> SamplePreInlineReplay(
+    "csspgo-replay-preinline", cl::Hidden, cl::init(false),
+    cl::desc(
+        "Replay previous inlining and adjust context profile accordingly"));
+
+static cl::opt<unsigned> CSPreinlMultiplierForPrevInl(
+    "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden,
+    cl::init(100),
+    cl::desc(
+        "Multiplier to bump up callsite threshold for previous inlining."));
+
+CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
+                           ProfiledBinary &Binary, ProfileSummary *Summary)
+    : UseContextCost(UseContextCostForPreInliner),
+      // TODO: Pass in a guid-to-name map in order for
+      // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
+      // as their profile context.
+      ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
+  // Set default preinliner hot/cold call site thresholds tuned with CSSPGO
+  // for good performance with reasonable profile size.
+  if (!SampleHotCallSiteThreshold.getNumOccurrences())
+    SampleHotCallSiteThreshold = 1500;
+  if (!SampleColdCallSiteThreshold.getNumOccurrences())
+    SampleColdCallSiteThreshold = 0;
+  if (!ProfileInlineLimitMax.getNumOccurrences())
+    ProfileInlineLimitMax = 50000;
+}
+
+std::vector<FunctionId> CSPreInliner::buildTopDownOrder() {
+  std::vector<FunctionId> Order;
+  // Trim cold edges to get a more stable call graph. This allows for a more
+  // stable top-down order, which in turn helps the stability of the generated
+  // profile from run to run.
+  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
+      (Summary->getDetailedSummary()));
+  ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold);
+
+  // Now that we have a profiled call graph, construct top-down order
+  // by building up SCC and reversing SCC order.
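+  // Illustrative example, not from the sources: for a profiled call graph
+  //   main -> foo -> bar
+  // the SCC iteration below visits bar, foo, main (bottom-up); collecting the
+  // node names and reversing at the end yields the top-down order
+  //   main, foo, bar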
+ scc_iterator I = scc_begin(&ProfiledCG); + while (!I.isAtEnd()) { + auto Range = *I; + if (SortProfiledSCC) { + // Sort nodes in one SCC based on callsite hotness. + scc_member_iterator SI(*I); + Range = *SI; + } + for (auto *Node : Range) { + if (Node != ProfiledCG.getEntryNode()) + Order.push_back(Node->Name); + } + ++I; + } + std::reverse(Order.begin(), Order.end()); + + return Order; +} + +bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, + const FunctionSamples *CallerSamples) { + assert(CallerSamples && "Expect non-null caller samples"); + + // Ideally we want to consider everything a function calls, but as far as + // context profile is concerned, only those frames that are children of + // current one in the trie is relavent. So we walk the trie instead of call + // targets from function profile. + ContextTrieNode *CallerNode = + ContextTracker.getContextNodeForProfile(CallerSamples); + + bool HasNewCandidate = false; + for (auto &Child : CallerNode->getAllChildContext()) { + ContextTrieNode *CalleeNode = &Child.second; + FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples(); + if (!CalleeSamples) + continue; + + // Call site count is more reliable, so we look up the corresponding call + // target profile in caller's context profile to retrieve call site count. + uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate(); + uint64_t CallsiteCount = 0; + LineLocation Callsite = CalleeNode->getCallSiteLoc(); + if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { + auto It = CallTargets->find(CalleeSamples->getFunction()); + if (It != CallTargets->end()) + CallsiteCount = It->second; + } + + // TODO: call site and callee entry count should be mostly consistent, add + // check for that. + HasNewCandidate = true; + uint32_t CalleeSize = getFuncSize(CalleeNode); + CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount), + CalleeSize); + } + + return HasNewCandidate; +} + +uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) { + if (UseContextCost) + return Binary.getFuncSizeForContext(ContextNode); + + return ContextNode->getFunctionSamples()->getBodySamples().size(); +} + +bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) { + bool WasInlined = + Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); + // If replay inline is requested, simply follow the inline decision of the + // profiled binary. + if (SamplePreInlineReplay) + return WasInlined; + + unsigned int SampleThreshold = SampleColdCallSiteThreshold; + uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( + (Summary->getDetailedSummary())); + + if (Candidate.CallsiteCount <= ColdCountThreshold) + SampleThreshold = SampleColdCallSiteThreshold; + else { + // Linearly adjust threshold based on normalized hotness, i.e, a value in + // [0,1]. Use 10% cutoff instead of the max count as the normalization + // upperbound for stability. + double NormalizationUpperBound = + ProfileSummaryBuilder::getEntryForPercentile( + Summary->getDetailedSummary(), 100000 /* 10% */) + .MinCount; + double NormalizationLowerBound = ColdCountThreshold; + double NormalizedHotness = + (Candidate.CallsiteCount - NormalizationLowerBound) / + (NormalizationUpperBound - NormalizationLowerBound); + if (NormalizedHotness > 1.0) + NormalizedHotness = 1.0; + // Add 1 to ensure hot callsites get a non-zero threshold, which could + // happen when SampleColdCallSiteThreshold is 0. 
This is when we do not + // want any inlining for cold callsites. + SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 + + SampleColdCallSiteThreshold + 1; + // Bump up the threshold to favor previous compiler inline decision. The + // compiler has more insight and knowledge about functions based on their IR + // and attribures and should be able to make a more reasonable inline + // decision. + if (WasInlined) + SampleThreshold *= CSPreinlMultiplierForPrevInl; + } + + return (Candidate.SizeCost < SampleThreshold); +} + +void CSPreInliner::processFunction(const FunctionId Name) { + FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name); + if (!FSamples) + return; + + unsigned FuncSize = + getFuncSize(ContextTracker.getContextNodeForProfile(FSamples)); + unsigned FuncFinalSize = FuncSize; + unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit; + SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); + SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); + + LLVM_DEBUG(dbgs() << "Process " << Name + << " for context-sensitive pre-inlining (pre-inline size: " + << FuncSize << ", size limit: " << SizeLimit << ")\n"); + + ProfiledCandidateQueue CQueue; + getInlineCandidates(CQueue, FSamples); + + while (!CQueue.empty() && FuncFinalSize < SizeLimit) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool ShouldInline = false; + if ((ShouldInline = shouldInline(Candidate))) { + // We mark context as inlined as the corresponding context profile + // won't be merged into that function's base profile. + ++PreInlNumCSInlined; + ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples); + Candidate.CalleeSamples->getContext().setAttribute( + ContextShouldBeInlined); + FuncFinalSize += Candidate.SizeCost; + getInlineCandidates(CQueue, Candidate.CalleeSamples); + } else { + ++PreInlNumCSNotInlined; + } + LLVM_DEBUG( + dbgs() << (ShouldInline ? " Inlined" : " Outlined") + << " context profile for: " + << ContextTracker.getContextString(*Candidate.CalleeSamples) + << " (callee size: " << Candidate.SizeCost + << ", call count:" << Candidate.CallsiteCount << ")\n"); + } + + if (!CQueue.empty()) { + if (SizeLimit == (unsigned)ProfileInlineLimitMax) + ++PreInlNumCSInlinedHitMaxLimit; + else if (SizeLimit == (unsigned)ProfileInlineLimitMin) + ++PreInlNumCSInlinedHitMinLimit; + else + ++PreInlNumCSInlinedHitGrowthLimit; + } + + LLVM_DEBUG({ + if (!CQueue.empty()) + dbgs() << " Inline candidates ignored due to size limit (inliner " + "original size: " + << FuncSize << ", inliner final size: " << FuncFinalSize + << ", size limit: " << SizeLimit << ")\n"; + + while (!CQueue.empty()) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool WasInlined = + Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); + dbgs() << " " + << ContextTracker.getContextString(*Candidate.CalleeSamples) + << " (candidate size:" << Candidate.SizeCost + << ", call count: " << Candidate.CallsiteCount << ", previously " + << (WasInlined ? 
"inlined)\n" : "not inlined)\n"); + } + }); +} + +void CSPreInliner::run() { +#ifndef NDEBUG + auto printProfileNames = [](SampleContextTracker &ContextTracker, + bool IsInput) { + uint32_t Size = 0; + for (auto *Node : ContextTracker) { + FunctionSamples *FSamples = Node->getFunctionSamples(); + if (FSamples) { + Size++; + dbgs() << " [" << ContextTracker.getContextString(Node) << "] " + << FSamples->getTotalSamples() << ":" + << FSamples->getHeadSamples() << "\n"; + } + } + dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles (" + << Size << " total):\n"; + }; +#endif + + LLVM_DEBUG(printProfileNames(ContextTracker, true)); + + // Execute global pre-inliner to estimate a global top-down inline + // decision and merge profiles accordingly. This helps with profile + // merge for ThinLTO otherwise we won't be able to merge profiles back + // to base profile across module/thin-backend boundaries. + // It also helps better compress context profile to control profile + // size, as we now only need context profile for functions going to + // be inlined. + for (FunctionId FuncName : buildTopDownOrder()) { + processFunction(FuncName); + } + + // Not inlined context profiles are merged into its base, so we can + // trim out such profiles from the output. + for (auto *Node : ContextTracker) { + FunctionSamples *FProfile = Node->getFunctionSamples(); + if (FProfile && + (Node->getParentContext() != &ContextTracker.getRootContext() && + !FProfile->getContext().hasState(InlinedContext))) { + Node->setFunctionSamples(nullptr); + } + } + FunctionSamples::ProfileIsPreInlined = true; + + LLVM_DEBUG(printProfileNames(ContextTracker, false)); +} diff --git a/tools/ldc-profgen/ldc-profgen-21.1/CSPreInliner.h b/tools/ldc-profgen/ldc-profgen-21.1/CSPreInliner.h new file mode 100644 index 0000000000..8a3f16a4f1 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-21.1/CSPreInliner.h @@ -0,0 +1,96 @@ +//===-- CSPreInliner.h - Profile guided preinliner ---------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H +#define LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H + +#include "ProfiledBinary.h" +#include "llvm/ADT/PriorityQueue.h" +#include "llvm/ProfileData/ProfileCommon.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/Transforms/IPO/ProfiledCallGraph.h" +#include "llvm/Transforms/IPO/SampleContextTracker.h" + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Inline candidate seen from profile +struct ProfiledInlineCandidate { + ProfiledInlineCandidate(const FunctionSamples *Samples, uint64_t Count, + uint32_t Size) + : CalleeSamples(Samples), CallsiteCount(Count), SizeCost(Size) {} + // Context-sensitive function profile for inline candidate + const FunctionSamples *CalleeSamples; + // Call site count for an inline candidate + // TODO: make sure entry count for context profile and call site + // target count for corresponding call are consistent. + uint64_t CallsiteCount; + // Size proxy for function under particular call context. 
+  uint64_t SizeCost;
+};
+
+// Inline candidate comparer using call site weight
+struct ProfiledCandidateComparer {
+  bool operator()(const ProfiledInlineCandidate &LHS,
+                  const ProfiledInlineCandidate &RHS) {
+    // Always prioritize inlining zero-sized functions as they do not affect the
+    // size budget. This could happen when all of the callee's code is gone and
+    // only pseudo probes are left.
+    if ((LHS.SizeCost == 0 || RHS.SizeCost == 0) &&
+        (LHS.SizeCost != RHS.SizeCost))
+      return RHS.SizeCost == 0;
+
+    if (LHS.CallsiteCount != RHS.CallsiteCount)
+      return LHS.CallsiteCount < RHS.CallsiteCount;
+
+    if (LHS.SizeCost != RHS.SizeCost)
+      return LHS.SizeCost > RHS.SizeCost;
+
+    // Tie breaker using GUID so we have stable/deterministic inlining order
+    assert(LHS.CalleeSamples && RHS.CalleeSamples &&
+           "Expect non-null FunctionSamples");
+    return LHS.CalleeSamples->getGUID() < RHS.CalleeSamples->getGUID();
+  }
+};
+
+using ProfiledCandidateQueue =
+    PriorityQueue<ProfiledInlineCandidate, std::vector<ProfiledInlineCandidate>,
+                  ProfiledCandidateComparer>;
+
+// Pre-compilation inliner based on context-sensitive profile.
+// The PreInliner estimates inline decisions using hotness from the profile
+// and cost estimation from machine code size. It helps merge context
+// profiles globally and achieves better post-inline profile quality, which
+// otherwise wouldn't be possible for ThinLTO. It also reduces context profile
+// size by only keeping contexts that are estimated to be inlined.
+class CSPreInliner {
+public:
+  CSPreInliner(SampleContextTracker &Tracker, ProfiledBinary &Binary,
+               ProfileSummary *Summary);
+  void run();
+
+private:
+  bool getInlineCandidates(ProfiledCandidateQueue &CQueue,
+                           const FunctionSamples *FCallerContextSamples);
+  std::vector<FunctionId> buildTopDownOrder();
+  void processFunction(FunctionId Name);
+  bool shouldInline(ProfiledInlineCandidate &Candidate);
+  uint32_t getFuncSize(const ContextTrieNode *ContextNode);
+  bool UseContextCost;
+  SampleContextTracker &ContextTracker;
+  ProfiledBinary &Binary;
+  ProfileSummary *Summary;
+};
+
+} // end namespace sampleprof
+} // end namespace llvm
+
+#endif
diff --git a/tools/ldc-profgen/ldc-profgen-21.1/CallContext.h b/tools/ldc-profgen/ldc-profgen-21.1/CallContext.h
new file mode 100644
index 0000000000..574833bfe8
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-21.1/CallContext.h
@@ -0,0 +1,58 @@
+//===-- CallContext.h - Call Context Handler ---------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H
+#define LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H
+
+#include "llvm/ProfileData/SampleProf.h"
+#include <sstream>
+#include <string>
+
+namespace llvm {
+namespace sampleprof {
+
+inline std::string getCallSite(const SampleContextFrame &Callsite) {
+  std::string CallsiteStr = Callsite.Func.str();
+  CallsiteStr += ":";
+  CallsiteStr += Twine(Callsite.Location.LineOffset).str();
+  if (Callsite.Location.Discriminator > 0) {
+    CallsiteStr += ".";
+    CallsiteStr += Twine(Callsite.Location.Discriminator).str();
+  }
+  return CallsiteStr;
+}
+
+// TODO: This operation is expensive. If it ever gets called multiple times we
+// may think of making a class wrapper with internal state for it.
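+// Illustrative example, not from the sources: a two-frame context
+// {main:3, foo:2.1} is rendered by the function below as
+//   "main:3 @ foo:2.1"
+// while the reversed variant further below yields "foo:2.1 @ main:3".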
+inline std::string getLocWithContext(const SampleContextFrameVector &Context) { + std::ostringstream OContextStr; + for (const auto &Callsite : Context) { + if (OContextStr.str().size()) + OContextStr << " @ "; + OContextStr << getCallSite(Callsite); + } + return OContextStr.str(); +} + +// Reverse call context, i.e., in the order of callee frames to caller frames, +// is useful during instruction printing or pseudo probe printing. +inline std::string +getReversedLocWithContext(const SampleContextFrameVector &Context) { + std::ostringstream OContextStr; + for (const auto &Callsite : reverse(Context)) { + if (OContextStr.str().size()) + OContextStr << " @ "; + OContextStr << getCallSite(Callsite); + } + return OContextStr.str(); +} + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-21.1/ErrorHandling.h b/tools/ldc-profgen/ldc-profgen-21.1/ErrorHandling.h new file mode 100644 index 0000000000..b797add8a8 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-21.1/ErrorHandling.h @@ -0,0 +1,56 @@ +//===-- ErrorHandling.h - Error handler -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H +#define LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H + +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/WithColor.h" +#include + +using namespace llvm; + +[[noreturn]] inline void exitWithError(const Twine &Message, + StringRef Whence = StringRef(), + StringRef Hint = StringRef()) { + WithColor::error(errs(), "llvm-profgen"); + if (!Whence.empty()) + errs() << Whence.str() << ": "; + errs() << Message << "\n"; + if (!Hint.empty()) + WithColor::note() << Hint.str() << "\n"; + ::exit(EXIT_FAILURE); +} + +[[noreturn]] inline void exitWithError(std::error_code EC, + StringRef Whence = StringRef()) { + exitWithError(EC.message(), Whence); +} + +[[noreturn]] inline void exitWithError(Error E, StringRef Whence) { + exitWithError(errorToErrorCode(std::move(E)), Whence); +} + +template +T unwrapOrError(Expected EO, Ts &&... Args) { + if (EO) + return std::move(*EO); + exitWithError(EO.takeError(), std::forward(Args)...); +} + +inline void emitWarningSummary(uint64_t Num, uint64_t Total, StringRef Msg) { + if (!Total || !Num) + return; + WithColor::warning() << format("%.2f", static_cast(Num) * 100 / Total) + << "%(" << Num << "/" << Total << ") " << Msg << "\n"; +} + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-21.1/MissingFrameInferrer.cpp b/tools/ldc-profgen/ldc-profgen-21.1/MissingFrameInferrer.cpp new file mode 100644 index 0000000000..edfe8979c7 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-21.1/MissingFrameInferrer.cpp @@ -0,0 +1,318 @@ +//===-- MissingFrameInferrer.cpp - Missing frame inferrer --------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MissingFrameInferrer.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/Statistic.h" +#include +#include +#include +#include +#include + +#define DEBUG_TYPE "missing-frame-inferrer" + +using namespace llvm; +using namespace sampleprof; + +STATISTIC(TailCallUniReachable, + "Number of frame pairs reachable via a unique tail call path"); +STATISTIC(TailCallMultiReachable, + "Number of frame pairs reachable via a multiple tail call paths"); +STATISTIC(TailCallUnreachable, + "Number of frame pairs unreachable via any tail call path"); +STATISTIC(TailCallFuncSingleTailCalls, + "Number of functions with single tail call site"); +STATISTIC(TailCallFuncMultipleTailCalls, + "Number of functions with multiple tail call sites"); +STATISTIC(TailCallMaxTailCallPath, "Length of the longest tail call path"); + +static cl::opt + MaximumSearchDepth("max-search-depth", cl::init(UINT32_MAX - 1), + cl::desc("The maximum levels the DFS-based missing " + "frame search should go with")); + +void MissingFrameInferrer::initialize( + const ContextSampleCounterMap *SampleCounters) { + // Refine call edges based on LBR samples. + if (SampleCounters) { + std::unordered_map> SampledCalls; + std::unordered_map> SampledTailCalls; + + // Populate SampledCalls based on static call sites. Similarly to + // SampledTailCalls. + for (const auto &CI : *SampleCounters) { + for (auto Item : CI.second.BranchCounter) { + auto From = Item.first.first; + auto To = Item.first.second; + if (CallEdges.count(From)) { + assert(CallEdges[From].size() == 1 && + "A callsite should only appear once with either a known or a " + "zero (unknown) target value at this point"); + SampledCalls[From].insert(To); + } + if (TailCallEdges.count(From)) { + assert(TailCallEdges[From].size() == 1 && + "A callsite should only appear once with either a known or a " + "zero (unknown) target value at this point"); + FuncRange *FromFRange = Binary->findFuncRange(From); + FuncRange *ToFRange = Binary->findFuncRange(To); + if (FromFRange != ToFRange) + SampledTailCalls[From].insert(To); + } + } + } + + // Replace static edges with dynamic edges. + CallEdges = SampledCalls; + TailCallEdges = SampledTailCalls; + } + + // Populate function-based edges. This is to speed up address to function + // translation. + for (auto Call : CallEdges) + for (auto Target : Call.second) + if (FuncRange *ToFRange = Binary->findFuncRange(Target)) + CallEdgesF[Call.first].insert(ToFRange->Func); + + for (auto Call : TailCallEdges) { + for (auto Target : Call.second) { + if (FuncRange *ToFRange = Binary->findFuncRange(Target)) { + TailCallEdgesF[Call.first].insert(ToFRange->Func); + TailCallTargetFuncs.insert(ToFRange->Func); + } + } + if (FuncRange *FromFRange = Binary->findFuncRange(Call.first)) + FuncToTailCallMap[FromFRange->Func].push_back(Call.first); + } + +#if LLVM_ENABLE_STATS + for (auto F : FuncToTailCallMap) { + assert(F.second.size() > 0 && ""); + if (F.second.size() > 1) + TailCallFuncMultipleTailCalls++; + else + TailCallFuncSingleTailCalls++; + } +#endif + +#ifndef NDEBUG + auto PrintCallTargets = + [&](const std::unordered_map> + &CallTargets, + bool IsTailCall) { + for (const auto &Targets : CallTargets) { + for (const auto &Target : Targets.second) { + dbgs() << (IsTailCall ? 
"TailCall" : "Call"); + dbgs() << " From " << format("%8" PRIx64, Targets.first) << " to " + << format("%8" PRIx64, Target) << "\n"; + } + } + }; + + LLVM_DEBUG(dbgs() << "============================\n "; + dbgs() << "Call targets:\n"; + PrintCallTargets(CallEdges, false); + dbgs() << "\nTail call targets:\n"; + PrintCallTargets(CallEdges, true); + dbgs() << "============================\n";); +#endif +} + +uint64_t MissingFrameInferrer::computeUniqueTailCallPath( + BinaryFunction *From, BinaryFunction *To, SmallVectorImpl &Path) { + // Search for a unique path comprised of only tail call edges for a given + // source and target frame address on the a tail call graph that consists of + // only tail call edges. Note that only a unique path counts. Multiple paths + // are treated unreachable. + if (From == To) + return 1; + + // Ignore cyclic paths. Since we are doing a recursive DFS walk, if the source + // frame being visited is already in the stack, it means we are seeing a + // cycle. This is done before querying the cached result because the cached + // result may be computed based on the same path. Consider the following case: + // A -> B, B -> A, A -> D + // When computing unique reachablity from A to D, the cached result for (B,D) + // should not be counted since the unique path B->A->D is basically the same + // path as A->D. Counting that with invalidate the uniqueness from A to D. + if (Visiting.contains(From)) + return 0; + + // If already computed, return the cached result. + auto I = UniquePaths.find({From, To}); + if (I != UniquePaths.end()) { + Path.append(I->second.begin(), I->second.end()); + return 1; + } + + auto J = NonUniquePaths.find({From, To}); + if (J != NonUniquePaths.end()) { + return J->second; + } + + uint64_t Pos = Path.size(); + + // DFS walk each outgoing tail call edges. + // Bail out if we are already at the the maximum searching depth. + if (CurSearchingDepth == MaximumSearchDepth) + return 0; + + auto It = FuncToTailCallMap.find(From); + if (It == FuncToTailCallMap.end()) + return 0; + + CurSearchingDepth++; + Visiting.insert(From); + uint64_t NumPaths = 0; + for (auto TailCall : It->second) { + NumPaths += computeUniqueTailCallPath(TailCall, To, Path); + // Stop analyzing the remaining if we are already seeing more than one + // reachable paths. + if (NumPaths > 1) + break; + } + CurSearchingDepth--; + Visiting.erase(From); + + // Undo already-computed path if it is not unique. + if (NumPaths != 1) { + Path.pop_back_n(Path.size() - Pos); + } + + // Cache the result. + if (NumPaths == 1) { + UniquePaths[{From, To}].assign(Path.begin() + Pos, Path.end()); +#if LLVM_ENABLE_STATS + auto &LocalPath = UniquePaths[{From, To}]; + assert((LocalPath.size() <= MaximumSearchDepth + 1) && + "Path should not be longer than the maximum searching depth"); + TailCallMaxTailCallPath = std::max(uint64_t(LocalPath.size()), + TailCallMaxTailCallPath.getValue()); +#endif + } else { + NonUniquePaths[{From, To}] = NumPaths; + } + + return NumPaths; +} + +uint64_t MissingFrameInferrer::computeUniqueTailCallPath( + uint64_t From, BinaryFunction *To, SmallVectorImpl &Path) { + auto It = TailCallEdgesF.find(From); + if (It == TailCallEdgesF.end()) + return 0; + Path.push_back(From); + uint64_t NumPaths = 0; + for (auto Target : It->second) { + NumPaths += computeUniqueTailCallPath(Target, To, Path); + // Stop analyzing the remaining if we are already seeing more than one + // reachable paths. + if (NumPaths > 1) + break; + } + + // Undo already-computed path if it is not unique. 
+ if (NumPaths != 1) + Path.pop_back(); + return NumPaths; +} + +bool MissingFrameInferrer::inferMissingFrames( + uint64_t From, uint64_t To, SmallVectorImpl &UniquePath) { + assert(!TailCallEdgesF.count(From) && + "transition between From and To cannot be via a tailcall otherwise " + "they would not show up at the same time"); + UniquePath.push_back(From); + uint64_t Pos = UniquePath.size(); + + FuncRange *ToFRange = Binary->findFuncRange(To); + if (!ToFRange) + return false; + + // Bail out if caller has no known outgoing call edges. + auto It = CallEdgesF.find(From); + if (It == CallEdgesF.end()) + return false; + + // Done with the inference if the calle is reachable via a single callsite. + // This may not be accurate but it improves the search throughput. + if (llvm::is_contained(It->second, ToFRange->Func)) + return true; + + // Bail out if callee is not tailcall reachable at all. + if (!TailCallTargetFuncs.contains(ToFRange->Func)) + return false; + + Visiting.clear(); + CurSearchingDepth = 0; + uint64_t NumPaths = 0; + for (auto Target : It->second) { + NumPaths += + computeUniqueTailCallPath(Target, ToFRange->Func, UniquePath); + // Stop analyzing the remaining if we are already seeing more than one + // reachable paths. + if (NumPaths > 1) + break; + } + + // Undo already-computed path if it is not unique. + if (NumPaths != 1) { + UniquePath.pop_back_n(UniquePath.size() - Pos); + assert(UniquePath.back() == From && "broken path"); + } + +#if LLVM_ENABLE_STATS + if (NumPaths == 1) { + if (ReachableViaUniquePaths.insert({From, ToFRange->StartAddress}).second) + TailCallUniReachable++; + } else if (NumPaths == 0) { + if (Unreachables.insert({From, ToFRange->StartAddress}).second) { + TailCallUnreachable++; + LLVM_DEBUG(dbgs() << "No path found from " + << format("%8" PRIx64 ":", From) << " to " + << format("%8" PRIx64 ":", ToFRange->StartAddress) + << "\n"); + } + } else if (NumPaths > 1) { + if (ReachableViaMultiPaths.insert({From, ToFRange->StartAddress}) + .second) { + TailCallMultiReachable++; + LLVM_DEBUG(dbgs() << "Multiple paths found from " + << format("%8" PRIx64 ":", From) << " to " + << format("%8" PRIx64 ":", ToFRange->StartAddress) + << "\n"); + } + } +#endif + + return NumPaths == 1; +} + +void MissingFrameInferrer::inferMissingFrames( + const SmallVectorImpl &Context, + SmallVectorImpl &NewContext) { + if (Context.size() == 1) { + NewContext = Context; + return; + } + + NewContext.clear(); + for (uint64_t I = 1; I < Context.size(); I++) { + inferMissingFrames(Context[I - 1], Context[I], NewContext); + } + NewContext.push_back(Context.back()); + + assert((NewContext.size() >= Context.size()) && + "Inferred context should include all frames in the original context"); + assert((NewContext.size() > Context.size() || NewContext == Context) && + "Inferred context should be exactly the same " + "with the original context"); +} diff --git a/tools/ldc-profgen/ldc-profgen-21.1/MissingFrameInferrer.h b/tools/ldc-profgen/ldc-profgen-21.1/MissingFrameInferrer.h new file mode 100644 index 0000000000..4680a9a979 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-21.1/MissingFrameInferrer.h @@ -0,0 +1,116 @@ +//===-- MissingFrameInferrer.h - Missing frame inferrer ---------- C++/-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_PROFGEN_MISSINGFRAMEINFERRER_H
+#define LLVM_TOOLS_LLVM_PROFGEN_MISSINGFRAMEINFERRER_H
+
+#include "PerfReader.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include <unordered_map>
+#include <unordered_set>
+
+namespace llvm {
+namespace sampleprof {
+
+class ProfiledBinary;
+struct BinaryFunction;
+
+class MissingFrameInferrer {
+public:
+  MissingFrameInferrer(ProfiledBinary *Binary) : Binary(Binary) {}
+
+  // Defines a frame transition from a caller function to the callee function.
+  using CallerCalleePair = std::pair<BinaryFunction *, BinaryFunction *>;
+
+  void initialize(const ContextSampleCounterMap *SampleCounters);
+
+  // Given an input `Context`, output `NewContext` with inferred missing tail
+  // call frames.
+  void inferMissingFrames(const SmallVectorImpl<uint64_t> &Context,
+                          SmallVectorImpl<uint64_t> &NewContext);
+
+private:
+  friend class ProfiledBinary;
+
+  // Compute a unique tail call path for a pair of source frame address and
+  // target frame address. Append the unique path prefix (not including `To`) to
+  // `UniquePath` if one exists. Return whether this is a unique tail call
+  // path. The source/dest frame will typically be a pair of adjacent frame
+  // entries of call stack samples.
+  bool inferMissingFrames(uint64_t From, uint64_t To,
+                          SmallVectorImpl<uint64_t> &UniquePath);
+
+  // Compute a unique tail call path from the source frame address to the target
+  // function. Output the unique path prefix (not including `To`) in
+  // `UniquePath` if one exists. Return the number of possibly available tail
+  // call paths.
+  uint64_t computeUniqueTailCallPath(uint64_t From, BinaryFunction *To,
+                                     SmallVectorImpl<uint64_t> &UniquePath);
+
+  // Compute a unique tail call path from the source function to the target
+  // function. Output the unique path prefix (not including `To`) in
+  // `UniquePath` if one exists. Return the number of possibly available tail
+  // call paths.
+  uint64_t computeUniqueTailCallPath(BinaryFunction *From, BinaryFunction *To,
+                                     SmallVectorImpl<uint64_t> &UniquePath);
+
+  ProfiledBinary *Binary;
+
+  // A map of call instructions to their target addresses. This is first
+  // populated with static call edges but then trimmed down to dynamic call
+  // edges based on LBR samples.
+  std::unordered_map<uint64_t, std::unordered_set<uint64_t>> CallEdges;
+
+  // A map of tail call instructions to their target addresses. This is first
+  // populated with static call edges but then trimmed down to dynamic call
+  // edges based on LBR samples.
+  std::unordered_map<uint64_t, std::unordered_set<uint64_t>> TailCallEdges;
+
+  // Dynamic call targets in terms of BinaryFunction for any calls.
+  std::unordered_map<uint64_t, std::unordered_set<BinaryFunction *>> CallEdgesF;
+
+  // Dynamic call targets in terms of BinaryFunction for tail calls.
+  std::unordered_map<uint64_t, std::unordered_set<BinaryFunction *>>
+      TailCallEdgesF;
+
+  // Dynamic tail call sites of caller functions.
+  std::unordered_map<BinaryFunction *, std::vector<uint64_t>> FuncToTailCallMap;
+
+  // Functions that are reachable via tail calls.
+  DenseSet<BinaryFunction *> TailCallTargetFuncs;
+
+  struct PairHash {
+    std::size_t operator()(
+        const std::pair<BinaryFunction *, BinaryFunction *> &Pair) const {
+      return std::hash<BinaryFunction *>()(Pair.first) ^
+             std::hash<BinaryFunction *>()(Pair.second);
+    }
+  };
+
+  // Cached results from a CallerCalleePair to a unique call path between them.
+  std::unordered_map<CallerCalleePair, std::vector<uint64_t>, PairHash>
+      UniquePaths;
+  // Cached results from CallerCalleePair to the number of available call paths.
+  std::unordered_map<CallerCalleePair, uint64_t, PairHash> NonUniquePaths;
+
+  DenseSet<BinaryFunction *> Visiting;
+
+  uint32_t CurSearchingDepth = 0;
+
+#if LLVM_ENABLE_STATS
+  DenseSet<std::pair<uint64_t, uint64_t>> ReachableViaUniquePaths;
+  DenseSet<std::pair<uint64_t, uint64_t>> Unreachables;
+  DenseSet<std::pair<uint64_t, uint64_t>> ReachableViaMultiPaths;
+#endif
+};
+} // end namespace sampleprof
+} // end namespace llvm
+
+#endif
diff --git a/tools/ldc-profgen/ldc-profgen-21.1/PerfReader.cpp b/tools/ldc-profgen/ldc-profgen-21.1/PerfReader.cpp
new file mode 100644
index 0000000000..ad113eda27
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-21.1/PerfReader.cpp
@@ -0,0 +1,1286 @@
+//===-- PerfReader.cpp - perfscript reader  ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "PerfReader.h"
+#include "ProfileGenerator.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/ToolOutputFile.h"
+
+#define DEBUG_TYPE "perf-reader"
+
+cl::opt<bool> SkipSymbolization("skip-symbolization",
+                                cl::desc("Dump the unsymbolized profile to the "
+                                         "output file. It will show unwinder "
+                                         "output for CS profile generation."));
+
+static cl::opt<bool> ShowMmapEvents("show-mmap-events",
+                                    cl::desc("Print binary load events."));
+
+static cl::opt<bool>
+    UseOffset("use-offset", cl::init(true),
+              cl::desc("Work with `--skip-symbolization` or "
+                       "`--unsymbolized-profile` to write/read the "
+                       "offset instead of virtual address."));
+
+static cl::opt<bool> UseLoadableSegmentAsBase(
+    "use-first-loadable-segment-as-base",
+    cl::desc("Use first loadable segment address as base address "
+             "for offsets in unsymbolized profile. By default "
+             "first executable segment address is used"));
+
+static cl::opt<bool>
+    IgnoreStackSamples("ignore-stack-samples",
+                       cl::desc("Ignore call stack samples for hybrid samples "
+                                "and produce context-insensitive profile."));
+cl::opt<bool> ShowDetailedWarning("show-detailed-warning",
+                                  cl::desc("Show detailed warning message."));
+
+static cl::opt<int> CSProfMaxUnsymbolizedCtxDepth(
+    "csprof-max-unsymbolized-context-depth", cl::init(-1),
+    cl::desc("Keep the last K contexts while merging unsymbolized profile. -1 "
+             "means no depth limit."));
+
+extern cl::opt<std::string> PerfTraceFilename;
+extern cl::opt<bool> ShowDisassemblyOnly;
+extern cl::opt<bool> ShowSourceLocations;
+extern cl::opt<std::string> OutputFilename;
+
+namespace llvm {
+namespace sampleprof {
+
+void VirtualUnwinder::unwindCall(UnwindState &State) {
+  uint64_t Source = State.getCurrentLBRSource();
+  auto *ParentFrame = State.getParentFrame();
+  // The 2nd frame after leaf could be missing if the stack sample is
+  // taken when IP is within prolog/epilog, as the frame chain isn't
+  // set up yet. Fill in the missing frame in that case.
+  // TODO: Currently we just assume all the addrs that can't match the
+  // 2nd frame are in prolog/epilog. In the future, we will switch to a
+  // pro/epi tracker (DWARF CFI) for the precise check.
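+  // Illustrative scenario, not from the sources: the stack sample's leaf is
+  // the callee, the LBR call source sits in `bar`, but `bar` was never
+  // recorded as the parent frame because its frame chain was not set up yet;
+  // switchToFrame(Source) below synthesizes the missing `bar` frame.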
+ if (ParentFrame == State.getDummyRootPtr() || + ParentFrame->Address != Source) { + State.switchToFrame(Source); + if (ParentFrame != State.getDummyRootPtr()) { + if (Source == ExternalAddr) + NumMismatchedExtCallBranch++; + else + NumMismatchedProEpiBranch++; + } + } else { + State.popFrame(); + } + State.InstPtr.update(Source); +} + +void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) { + InstructionPointer &IP = State.InstPtr; + uint64_t Target = State.getCurrentLBRTarget(); + uint64_t End = IP.Address; + + if (End == ExternalAddr && Target == ExternalAddr) { + // Filter out the case when leaf external frame matches the external LBR + // target, this is a valid state, it happens that the code run into external + // address then return back. The call frame under the external frame + // remains valid and can be unwound later, just skip recording this range. + NumPairedExtAddr++; + return; + } + + if (End == ExternalAddr || Target == ExternalAddr) { + // Range is invalid if only one point is external address. This means LBR + // traces contains a standalone external address failing to pair another + // one, likely due to interrupt jmp or broken perf script. Set the + // state to invalid. + NumUnpairedExtAddr++; + State.setInvalid(); + return; + } + + if (!isValidFallThroughRange(Target, End, Binary)) { + // Skip unwinding the rest of LBR trace when a bogus range is seen. + State.setInvalid(); + return; + } + + if (Binary->usePseudoProbes()) { + // We don't need to top frame probe since it should be extracted + // from the range. + // The outcome of the virtual unwinding with pseudo probes is a + // map from a context key to the address range being unwound. + // This means basically linear unwinding is not needed for pseudo + // probes. The range will be simply recorded here and will be + // converted to a list of pseudo probes to report in ProfileGenerator. + State.getParentFrame()->recordRangeCount(Target, End, Repeat); + } else { + // Unwind linear execution part. + // Split and record the range by different inline context. For example: + // [0x01] ... main:1 # Target + // [0x02] ... main:2 + // [0x03] ... main:3 @ foo:1 + // [0x04] ... main:3 @ foo:2 + // [0x05] ... main:3 @ foo:3 + // [0x06] ... main:4 + // [0x07] ... main:5 # End + // It will be recorded: + // [main:*] : [0x06, 0x07], [0x01, 0x02] + // [main:3 @ foo:*] : [0x03, 0x05] + while (IP.Address > Target) { + uint64_t PrevIP = IP.Address; + IP.backward(); + // Break into segments for implicit call/return due to inlining + bool SameInlinee = Binary->inlineContextEqual(PrevIP, IP.Address); + if (!SameInlinee) { + State.switchToFrame(PrevIP); + State.CurrentLeafFrame->recordRangeCount(PrevIP, End, Repeat); + End = IP.Address; + } + } + assert(IP.Address == Target && "The last one must be the target address."); + // Record the remaining range, [0x01, 0x02] in the example + State.switchToFrame(IP.Address); + State.CurrentLeafFrame->recordRangeCount(IP.Address, End, Repeat); + } +} + +void VirtualUnwinder::unwindReturn(UnwindState &State) { + // Add extra frame as we unwind through the return + const LBREntry &LBR = State.getCurrentLBR(); + uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target); + State.switchToFrame(CallAddr); + State.pushFrame(LBR.Source); + State.InstPtr.update(LBR.Source); +} + +void VirtualUnwinder::unwindBranch(UnwindState &State) { + // TODO: Tolerate tail call for now, as we may see tail call from libraries. + // This is only for intra function branches, excluding tail calls. 
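+  // E.g. (illustrative): an LBR entry 0x1010 -> 0x1040 with both addresses in
+  // the same function simply repositions the instruction pointer within the
+  // current frame.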
+ uint64_t Source = State.getCurrentLBRSource(); + State.switchToFrame(Source); + State.InstPtr.update(Source); +} + +std::shared_ptr FrameStack::getContextKey() { + std::shared_ptr KeyStr = + std::make_shared(); + KeyStr->Context = Binary->getExpandedContext(Stack, KeyStr->WasLeafInlined); + return KeyStr; +} + +std::shared_ptr AddressStack::getContextKey() { + std::shared_ptr KeyStr = std::make_shared(); + KeyStr->Context = Stack; + CSProfileGenerator::compressRecursionContext(KeyStr->Context); + // MaxContextDepth(--csprof-max-context-depth) is used to trim both symbolized + // and unsymbolized profile context. Sometimes we want to at least preserve + // the inlinings for the leaf frame(the profiled binary inlining), + // --csprof-max-context-depth may not be flexible enough, in this case, + // --csprof-max-unsymbolized-context-depth is used to limit the context for + // unsymbolized profile. If both are set, use the minimum of them. + int Depth = CSProfileGenerator::MaxContextDepth != -1 + ? CSProfileGenerator::MaxContextDepth + : KeyStr->Context.size(); + Depth = CSProfMaxUnsymbolizedCtxDepth != -1 + ? std::min(static_cast(CSProfMaxUnsymbolizedCtxDepth), Depth) + : Depth; + CSProfileGenerator::trimContext(KeyStr->Context, Depth); + return KeyStr; +} + +template +void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, + T &Stack) { + if (Cur->RangeSamples.empty() && Cur->BranchSamples.empty()) + return; + + std::shared_ptr Key = Stack.getContextKey(); + if (Key == nullptr) + return; + auto Ret = CtxCounterMap->emplace(Hashable(Key), SampleCounter()); + SampleCounter &SCounter = Ret.first->second; + for (auto &I : Cur->RangeSamples) + SCounter.recordRangeCount(std::get<0>(I), std::get<1>(I), std::get<2>(I)); + + for (auto &I : Cur->BranchSamples) + SCounter.recordBranchCount(std::get<0>(I), std::get<1>(I), std::get<2>(I)); +} + +template +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur, T &Stack) { + if (!Cur->isDummyRoot()) { + // Truncate the context for external frame since this isn't a real call + // context the compiler will see. + if (Cur->isExternalFrame() || !Stack.pushFrame(Cur)) { + // Process truncated context + // Start a new traversal ignoring its bottom context + T EmptyStack(Binary); + collectSamplesFromFrame(Cur, EmptyStack); + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), EmptyStack); + } + + // Keep note of untracked call site and deduplicate them + // for warning later. + if (!Cur->isLeafFrame()) + UntrackedCallsites.insert(Cur->Address); + + return; + } + } + + collectSamplesFromFrame(Cur, Stack); + // Process children frame + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), Stack); + } + // Recover the call stack + Stack.popFrame(); +} + +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur) { + if (Binary->usePseudoProbes()) { + AddressStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } else { + FrameStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } +} + +void VirtualUnwinder::recordBranchCount(const LBREntry &Branch, + UnwindState &State, uint64_t Repeat) { + if (Branch.Target == ExternalAddr) + return; + + // Record external-to-internal pattern on the trie root, it later can be + // used for generating head samples. 
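+  // E.g. (illustrative): a branch from an external module into `main` is
+  // credited to the dummy root rather than to a real frame, and later turns
+  // into a head sample for `main`.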
+ if (Branch.Source == ExternalAddr) {
+ State.getDummyRootPtr()->recordBranchCount(Branch.Source, Branch.Target,
+ Repeat);
+ return;
+ }
+
+ if (Binary->usePseudoProbes()) {
+ // Same as recordRangeCount: we don't need the top frame probe since we
+ // will extract it from the branch's source address.
+ State.getParentFrame()->recordBranchCount(Branch.Source, Branch.Target,
+ Repeat);
+ } else {
+ State.CurrentLeafFrame->recordBranchCount(Branch.Source, Branch.Target,
+ Repeat);
+ }
+}
+
+bool VirtualUnwinder::unwind(const PerfSample *Sample, uint64_t Repeat) {
+ // Capture the initial state as the starting point for unwinding.
+ UnwindState State(Sample, Binary);
+
+ // Sanity check - make sure the leaf of the LBR aligns with the leaf of the
+ // stack sample. Stack samples can sometimes be unreliable, so filter out
+ // bogus ones.
+ if (!State.validateInitialState())
+ return false;
+
+ NumTotalBranches += State.LBRStack.size();
+ // Now process the LBR samples in parallel with the stack sample.
+ // Note that we do not reverse the LBR entry order so we can
+ // unwind the sample stack as we walk through LBR entries.
+ while (State.hasNextLBR()) {
+ State.checkStateConsistency();
+
+ // Do not attempt linear unwinding for the leaf range as it's incomplete.
+ if (!State.IsLastLBR()) {
+ // Unwind implicit calls/returns from inlining, along the linear path,
+ // breaking it into smaller subsections, each with its own calling
+ // context.
+ unwindLinear(State, Repeat);
+ }
+
+ // Save the LBR branch before it gets unwound.
+ const LBREntry &Branch = State.getCurrentLBR();
+ if (isCallState(State)) {
+ // Unwind calls - we know we encountered a call if the LBR overlaps
+ // with the transition between the leaf and the 2nd frame. Note that
+ // for calls that were not in the original stack sample, we should have
+ // added the extra frame when processing the return paired with this
+ // call.
+ unwindCall(State);
+ } else if (isReturnState(State)) {
+ // Unwind returns - check whether the IP is indeed at a return
+ // instruction.
+ unwindReturn(State);
+ } else if (isValidState(State)) {
+ // Unwind branches.
+ unwindBranch(State);
+ } else {
+ // Skip unwinding the rest of the LBR trace. Reset the stack and update
+ // the state so that the rest of the trace can still be processed as if
+ // it did not have stack samples.
+ State.clearCallStack();
+ State.InstPtr.update(State.getCurrentLBRSource());
+ State.pushFrame(State.InstPtr.Address);
+ }
+
+ State.advanceLBR();
+ // Record `branch` with its calling context after unwinding.
+ recordBranchCount(Branch, State, Repeat);
+ }
+ // As samples are aggregated on the trie, record them into the counter map.
+ collectSamplesFromFrameTrie(State.getDummyRootPtr());
+
+ return true;
+}
+
+std::unique_ptr<PerfReaderBase>
+PerfReaderBase::create(ProfiledBinary *Binary, PerfInputFile &PerfInput,
+ std::optional<int32_t> PIDFilter) {
+ std::unique_ptr<PerfReaderBase> PerfReader;
+
+ if (PerfInput.Format == PerfFormat::UnsymbolizedProfile) {
+ PerfReader.reset(
+ new UnsymbolizedProfileReader(Binary, PerfInput.InputFile));
+ return PerfReader;
+ }
+
+ // For perf data input, we need to convert it into a perf script first.
+ // If this is a kernel perf file, there is no need for retrieving PIDs.
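+ // Roughly, the conversion below corresponds to running (sketch; file
+ // names hypothetical):
+ // perf script --show-mmap-events -F comm,pid -i perf.data # find PIDs
+ // perf script --show-mmap-events -F ip,brstack --pid <PIDs> -i perf.data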
+ if (PerfInput.Format == PerfFormat::PerfData)
+ PerfInput = PerfScriptReader::convertPerfDataToTrace(
+ Binary, Binary->isKernel(), PerfInput, PIDFilter);
+
+ assert((PerfInput.Format == PerfFormat::PerfScript) &&
+ "Should be a perfscript!");
+
+ PerfInput.Content =
+ PerfScriptReader::checkPerfScriptType(PerfInput.InputFile);
+ if (PerfInput.Content == PerfContent::LBRStack) {
+ PerfReader.reset(
+ new HybridPerfReader(Binary, PerfInput.InputFile, PIDFilter));
+ } else if (PerfInput.Content == PerfContent::LBR) {
+ PerfReader.reset(new LBRPerfReader(Binary, PerfInput.InputFile, PIDFilter));
+ } else {
+ exitWithError("Unsupported perfscript!");
+ }
+
+ return PerfReader;
+}
+
+PerfInputFile
+PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary, bool SkipPID,
+ PerfInputFile &File,
+ std::optional<int32_t> PIDFilter) {
+ StringRef PerfData = File.InputFile;
+ // Run perf script to retrieve the PIDs matching the binary we're
+ // interested in.
+ auto PerfExecutable = sys::Process::FindInEnvPath("PATH", "perf");
+ if (!PerfExecutable) {
+ exitWithError("Perf not found.");
+ }
+ std::string PerfPath = *PerfExecutable;
+ SmallString<128> PerfTraceFile;
+ sys::fs::createUniquePath("perf-script-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%.tmp",
+ PerfTraceFile, /*MakeAbsolute=*/true);
+ std::string ErrorFile = std::string(PerfTraceFile) + ".err";
+ std::optional<StringRef> Redirects[] = {std::nullopt, // Stdin
+ StringRef(PerfTraceFile), // Stdout
+ StringRef(ErrorFile)}; // Stderr
+ PerfScriptReader::TempFileCleanups.emplace_back(PerfTraceFile);
+ PerfScriptReader::TempFileCleanups.emplace_back(ErrorFile);
+
+ std::string PIDs;
+ if (!SkipPID) {
+ StringRef ScriptMMapArgs[] = {PerfPath, "script", "--show-mmap-events",
+ "-F", "comm,pid", "-i",
+ PerfData};
+ sys::ExecuteAndWait(PerfPath, ScriptMMapArgs, std::nullopt, Redirects);
+
+ // Collect the PIDs
+ TraceStream TraceIt(PerfTraceFile);
+ std::unordered_set<int64_t> PIDSet;
+ while (!TraceIt.isAtEoF()) {
+ MMapEvent MMap;
+ if (isMMapEvent(TraceIt.getCurrentLine()) &&
+ extractMMapEventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) {
+ auto It = PIDSet.emplace(MMap.PID);
+ if (It.second && (!PIDFilter || MMap.PID == *PIDFilter)) {
+ if (!PIDs.empty()) {
+ PIDs.append(",");
+ }
+ PIDs.append(utostr(MMap.PID));
+ }
+ }
+ TraceIt.advance();
+ }
+
+ if (PIDs.empty()) {
+ exitWithError("No relevant mmap event is found in perf data.");
+ }
+ }
+
+ // Run perf script again to retrieve events for the PIDs collected above.
+ SmallVector<StringRef> ScriptSampleArgs;
+ ScriptSampleArgs.push_back(PerfPath);
+ ScriptSampleArgs.push_back("script");
+ ScriptSampleArgs.push_back("--show-mmap-events");
+ ScriptSampleArgs.push_back("-F");
+ ScriptSampleArgs.push_back("ip,brstack");
+ ScriptSampleArgs.push_back("-i");
+ ScriptSampleArgs.push_back(PerfData);
+ if (!PIDs.empty()) {
+ ScriptSampleArgs.push_back("--pid");
+ ScriptSampleArgs.push_back(PIDs);
+ }
+ sys::ExecuteAndWait(PerfPath, ScriptSampleArgs, std::nullopt, Redirects);
+
+ return {std::string(PerfTraceFile), PerfFormat::PerfScript,
+ PerfContent::UnknownContent};
+}
+
+static StringRef filename(StringRef Path, bool UseBackSlash) {
+ llvm::sys::path::Style PathStyle =
+ UseBackSlash ? llvm::sys::path::Style::windows_backslash
+ : llvm::sys::path::Style::native;
+ StringRef FileName = llvm::sys::path::filename(Path, PathStyle);
+
+ // In case this file uses \r\n as newlines.
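+ // e.g. a CRLF line yields a name like "foo.exe\r", which should be
+ // trimmed to "foo.exe" (example name hypothetical).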
+ if (UseBackSlash && FileName.back() == '\r') + return FileName.drop_back(); + + return FileName; +} + +void PerfScriptReader::updateBinaryAddress(const MMapEvent &Event) { + // Drop the event which doesn't belong to user-provided binary + StringRef BinaryName = filename(Event.BinaryPath, Binary->isCOFF()); + bool IsKernel = Binary->isKernel(); + if (!IsKernel && Binary->getName() != BinaryName) + return; + if (IsKernel && !Binary->isKernelImageName(BinaryName)) + return; + + // Drop the event if process does not match pid filter + if (PIDFilter && Event.PID != *PIDFilter) + return; + + // Drop the event if its image is loaded at the same address + if (Event.Address == Binary->getBaseAddress()) { + Binary->setIsLoadedByMMap(true); + return; + } + + if (IsKernel || Event.Offset == Binary->getTextSegmentOffset()) { + // A binary image could be unloaded and then reloaded at different + // place, so update binary load address. + // Only update for the first executable segment and assume all other + // segments are loaded at consecutive memory addresses, which is the case on + // X64. + Binary->setBaseAddress(Event.Address); + Binary->setIsLoadedByMMap(true); + } else { + // Verify segments are loaded consecutively. + const auto &Offsets = Binary->getTextSegmentOffsets(); + auto It = llvm::lower_bound(Offsets, Event.Offset); + if (It != Offsets.end() && *It == Event.Offset) { + // The event is for loading a separate executable segment. + auto I = std::distance(Offsets.begin(), It); + const auto &PreferredAddrs = Binary->getPreferredTextSegmentAddresses(); + if (PreferredAddrs[I] - Binary->getPreferredBaseAddress() != + Event.Address - Binary->getBaseAddress()) + exitWithError("Executable segments not loaded consecutively"); + } else { + if (It == Offsets.begin()) + exitWithError("File offset not found"); + else { + // Find the segment the event falls in. A large segment could be loaded + // via multiple mmap calls with consecutive memory addresses. + --It; + assert(*It < Event.Offset); + if (Event.Offset - *It != Event.Address - Binary->getBaseAddress()) + exitWithError("Segment not loaded by consecutive mmaps"); + } + } + } +} + +static std::string getContextKeyStr(ContextKey *K, + const ProfiledBinary *Binary) { + if (const auto *CtxKey = dyn_cast(K)) { + return SampleContext::getContextString(CtxKey->Context); + } else if (const auto *CtxKey = dyn_cast(K)) { + std::ostringstream OContextStr; + for (uint32_t I = 0; I < CtxKey->Context.size(); I++) { + if (OContextStr.str().size()) + OContextStr << " @ "; + uint64_t Address = CtxKey->Context[I]; + if (UseOffset) { + if (UseLoadableSegmentAsBase) + Address -= Binary->getFirstLoadableAddress(); + else + Address -= Binary->getPreferredBaseAddress(); + } + OContextStr << "0x" + << utohexstr(Address, + /*LowerCase=*/true); + } + return OContextStr.str(); + } else { + llvm_unreachable("unexpected key type"); + } +} + +void HybridPerfReader::unwindSamples() { + VirtualUnwinder Unwinder(&SampleCounters, Binary); + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + Unwinder.unwind(Sample, Item.second); + } + + // Warn about untracked frames due to missing probes. 
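+ // A detailed warning looks like (illustrative address):
+ // warning: Profile context truncated due to missing probe for call
+ // instruction at 0x401234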
+ if (ShowDetailedWarning) {
+ for (auto Address : Unwinder.getUntrackedCallsites())
+ WithColor::warning() << "Profile context truncated due to missing probe "
+ << "for call instruction at "
+ << format("0x%" PRIx64, Address) << "\n";
+ }
+
+ emitWarningSummary(Unwinder.getUntrackedCallsites().size(),
+ SampleCounters.size(),
+ "of profiled contexts are truncated due to missing "
+ "probes for call instructions.");
+
+ emitWarningSummary(
+ Unwinder.NumMismatchedExtCallBranch, Unwinder.NumTotalBranches,
+ "of branches' source is a call instruction but doesn't match the call "
+ "frame stack, likely due to an unwinding error of the external frame.");
+
+ emitWarningSummary(Unwinder.NumPairedExtAddr * 2, Unwinder.NumTotalBranches,
+ "of branches contain paired external addresses.");
+
+ emitWarningSummary(Unwinder.NumUnpairedExtAddr, Unwinder.NumTotalBranches,
+ "of branches contain an external address without "
+ "another external address to pair with, likely due to "
+ "an interrupt jmp or a broken perf script.");
+
+ emitWarningSummary(
+ Unwinder.NumMismatchedProEpiBranch, Unwinder.NumTotalBranches,
+ "of branches' source is a call instruction but doesn't match the call "
+ "frame stack, likely due to a frame in prolog/epilog.");
+
+ emitWarningSummary(Unwinder.NumMissingExternalFrame,
+ Unwinder.NumExtCallBranch,
+ "of artificial call branches don't have an external "
+ "frame to match.");
+}
+
+bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt,
+ SmallVectorImpl<LBREntry> &LBRStack) {
+ // The raw format of an LBR stack is like:
+ // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
+ // ... 0x4005c8/0x4005dc/P/-/-/0
+ // It's in FIFO order and separated by whitespace.
+ SmallVector<StringRef> Records;
+ TraceIt.getCurrentLine().rtrim().split(Records, " ", -1, false);
+ auto WarnInvalidLBR = [](TraceStream &TraceIt) {
+ WithColor::warning() << "Invalid address in LBR record at line "
+ << TraceIt.getLineNumber() << ": "
+ << TraceIt.getCurrentLine() << "\n";
+ };
+
+ // Skip the leading instruction pointer.
+ size_t Index = 0;
+ uint64_t LeadingAddr;
+ if (!Records.empty() && !Records[0].contains('/')) {
+ if (Records[0].getAsInteger(16, LeadingAddr)) {
+ WarnInvalidLBR(TraceIt);
+ TraceIt.advance();
+ return false;
+ }
+ Index = 1;
+ }
+
+ // Now extract LBR samples - note that we do not reverse the
+ // LBR entry order so we can unwind the sample stack as we walk
+ // through LBR entries.
+ while (Index < Records.size()) {
+ auto &Token = Records[Index++];
+ if (Token.size() == 0)
+ continue;
+
+ SmallVector<StringRef> Addresses;
+ Token.split(Addresses, "/");
+ uint64_t Src;
+ uint64_t Dst;
+
+ // Stop at broken LBR records.
+ if (Addresses.size() < 2 || Addresses[0].substr(2).getAsInteger(16, Src) ||
+ Addresses[1].substr(2).getAsInteger(16, Dst)) {
+ WarnInvalidLBR(TraceIt);
+ break;
+ }
+
+ // Canonicalize to use the preferred load address as the base address.
+ Src = Binary->canonicalizeVirtualAddress(Src);
+ Dst = Binary->canonicalizeVirtualAddress(Dst);
+ bool SrcIsInternal = Binary->addressIsCode(Src);
+ bool DstIsInternal = Binary->addressIsCode(Dst);
+ if (!SrcIsInternal)
+ Src = ExternalAddr;
+ if (!DstIsInternal)
+ Dst = ExternalAddr;
+ // Filter out the external-to-external case to reduce LBR trace size.
+ if (!SrcIsInternal && !DstIsInternal) + continue; + + LBRStack.emplace_back(LBREntry(Src, Dst)); + } + TraceIt.advance(); + return !LBRStack.empty(); +} + +bool PerfScriptReader::extractCallstack(TraceStream &TraceIt, + SmallVectorImpl &CallStack) { + // The raw format of call stack is like: + // 4005dc # leaf frame + // 400634 + // 400684 # root frame + // It's in bottom-up order with each frame in one line. + + // Extract stack frames from sample + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().starts_with(" 0x")) { + StringRef FrameStr = TraceIt.getCurrentLine().ltrim(); + uint64_t FrameAddr = 0; + if (FrameStr.getAsInteger(16, FrameAddr)) { + // We might parse a non-perf sample line like empty line and comments, + // skip it + TraceIt.advance(); + return false; + } + TraceIt.advance(); + + FrameAddr = Binary->canonicalizeVirtualAddress(FrameAddr); + // Currently intermixed frame from different binaries is not supported. + if (!Binary->addressIsCode(FrameAddr)) { + if (CallStack.empty()) + NumLeafExternalFrame++; + // Push a special value(ExternalAddr) for the external frames so that + // unwinder can still work on this with artificial Call/Return branch. + // After unwinding, the context will be truncated for external frame. + // Also deduplicate the consecutive external addresses. + if (CallStack.empty() || CallStack.back() != ExternalAddr) + CallStack.emplace_back(ExternalAddr); + continue; + } + + // We need to translate return address to call address for non-leaf frames. + if (!CallStack.empty()) { + auto CallAddr = Binary->getCallAddrFromFrameAddr(FrameAddr); + if (!CallAddr) { + // Stop at an invalid return address caused by bad unwinding. This could + // happen to frame-pointer-based unwinding and the callee functions that + // do not have the frame pointer chain set up. + InvalidReturnAddresses.insert(FrameAddr); + break; + } + FrameAddr = CallAddr; + } + + CallStack.emplace_back(FrameAddr); + } + + // Strip out the bottom external addr. + if (CallStack.size() > 1 && CallStack.back() == ExternalAddr) + CallStack.pop_back(); + + // Skip other unrelated line, find the next valid LBR line + // Note that even for empty call stack, we should skip the address at the + // bottom, otherwise the following pass may generate a truncated callstack + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().starts_with(" 0x")) { + TraceIt.advance(); + } + // Filter out broken stack sample. We may not have complete frame info + // if sample end up in prolog/epilog, the result is dangling context not + // connected to entry point. This should be relatively rare thus not much + // impact on overall profile quality. However we do want to filter them + // out to reduce the number of different calling contexts. One instance + // of such case - when sample landed in prolog/epilog, somehow stack + // walking will be broken in an unexpected way that higher frames will be + // missing. + return !CallStack.empty() && + !Binary->addressInPrologEpilog(CallStack.front()); +} + +void PerfScriptReader::warnIfMissingMMap() { + if (!Binary->getMissingMMapWarned() && !Binary->getIsLoadedByMMap()) { + WithColor::warning() << "No relevant mmap event is matched for " + << Binary->getName() + << ", will use preferred address (" + << format("0x%" PRIx64, + Binary->getPreferredBaseAddress()) + << ") as the base loading address!\n"; + // Avoid redundant warning, only warn at the first unmatched sample. 
+ Binary->setMissingMMapWarned(true);
+ }
+}
+
+void HybridPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) {
+ // The raw hybrid sample starts with the call stack in FILO order,
+ // immediately followed by the LBR sample, e.g.
+ // 4005dc # call stack leaf
+ // 400634
+ // 400684 # call stack root
+ // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
+ // ... 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries
+ //
+ std::shared_ptr<PerfSample> Sample = std::make_shared<PerfSample>();
+#ifndef NDEBUG
+ Sample->Linenum = TraceIt.getLineNumber();
+#endif
+ // Parse the call stack and populate PerfSample.CallStack.
+ if (!extractCallstack(TraceIt, Sample->CallStack)) {
+ // Skip the next LBR line matching the current call stack.
+ if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().starts_with(" 0x"))
+ TraceIt.advance();
+ return;
+ }
+
+ warnIfMissingMMap();
+
+ if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().starts_with(" 0x")) {
+ // Parse the LBR stack and populate PerfSample.LBRStack.
+ if (extractLBRStack(TraceIt, Sample->LBRStack)) {
+ if (IgnoreStackSamples) {
+ Sample->CallStack.clear();
+ } else {
+ // Canonicalize the stack leaf so a 'random' IP from the leaf frame
+ // doesn't skew LBR ranges.
+ Sample->CallStack.front() = Sample->LBRStack[0].Target;
+ }
+ // Record samples by aggregation.
+ AggregatedSamples[Hashable<PerfSample>(Sample)] += Count;
+ }
+ } else {
+ // The LBR sample is encoded in a single line after the stack sample.
+ exitWithError("Hybrid perf sample is corrupted, no LBR sample line");
+ }
+}
+
+void PerfScriptReader::writeUnsymbolizedProfile(StringRef Filename) {
+ std::error_code EC;
+ raw_fd_ostream OS(Filename, EC, llvm::sys::fs::OF_TextWithCRLF);
+ if (EC)
+ exitWithError(EC, Filename);
+ writeUnsymbolizedProfile(OS);
+}
+
+// Use an ordered map to make the output deterministic.
+using OrderedCounterForPrint = std::map<std::string, SampleCounter *>;
+
+void PerfScriptReader::writeUnsymbolizedProfile(raw_fd_ostream &OS) {
+ OrderedCounterForPrint OrderedCounters;
+ for (auto &CI : SampleCounters) {
+ OrderedCounters[getContextKeyStr(CI.first.getPtr(), Binary)] = &CI.second;
+ }
+
+ auto SCounterPrinter = [&](RangeSample &Counter, StringRef Separator,
+ uint32_t Indent) {
+ OS.indent(Indent);
+ OS << Counter.size() << "\n";
+ for (auto &I : Counter) {
+ uint64_t Start = I.first.first;
+ uint64_t End = I.first.second;
+
+ if (UseOffset) {
+ if (UseLoadableSegmentAsBase) {
+ Start -= Binary->getFirstLoadableAddress();
+ End -= Binary->getFirstLoadableAddress();
+ } else {
+ Start -= Binary->getPreferredBaseAddress();
+ End -= Binary->getPreferredBaseAddress();
+ }
+ }
+
+ OS.indent(Indent);
+ OS << Twine::utohexstr(Start) << Separator << Twine::utohexstr(End) << ":"
+ << I.second << "\n";
+ }
+ };
+
+ for (auto &CI : OrderedCounters) {
+ uint32_t Indent = 0;
+ if (ProfileIsCS) {
+ // Context string key
+ OS << "[" << CI.first << "]\n";
+ Indent = 2;
+ }
+
+ SampleCounter &Counter = *CI.second;
+ SCounterPrinter(Counter.RangeCounter, "-", Indent);
+ SCounterPrinter(Counter.BranchCounter, "->", Indent);
+ }
+}
+
+// Format of the input:
+// number of entries in RangeCounter
+// from_1-to_1:count_1
+// from_2-to_2:count_2
+// ......
+// from_n-to_n:count_n
+// number of entries in BranchCounter
+// src_1->dst_1:count_1
+// src_2->dst_2:count_2
+// ......
+// src_n->dst_n:count_n
+void UnsymbolizedProfileReader::readSampleCounters(TraceStream &TraceIt,
+ SampleCounter &SCounters) {
+ auto exitWithErrorForTraceLine = [](TraceStream &TraceIt) {
+ std::string Msg = TraceIt.isAtEoF()
+ ? "Invalid raw profile!"
+ : "Invalid raw profile at line " + + Twine(TraceIt.getLineNumber()).str() + ": " + + TraceIt.getCurrentLine().str(); + exitWithError(Msg); + }; + auto ReadNumber = [&](uint64_t &Num) { + if (TraceIt.isAtEoF()) + exitWithErrorForTraceLine(TraceIt); + if (TraceIt.getCurrentLine().ltrim().getAsInteger(10, Num)) + exitWithErrorForTraceLine(TraceIt); + TraceIt.advance(); + }; + + auto ReadCounter = [&](RangeSample &Counter, StringRef Separator) { + uint64_t Num = 0; + ReadNumber(Num); + while (Num--) { + if (TraceIt.isAtEoF()) + exitWithErrorForTraceLine(TraceIt); + StringRef Line = TraceIt.getCurrentLine().ltrim(); + + uint64_t Count = 0; + auto LineSplit = Line.split(":"); + if (LineSplit.second.empty() || LineSplit.second.getAsInteger(10, Count)) + exitWithErrorForTraceLine(TraceIt); + + uint64_t Source = 0; + uint64_t Target = 0; + auto Range = LineSplit.first.split(Separator); + if (Range.second.empty() || Range.first.getAsInteger(16, Source) || + Range.second.getAsInteger(16, Target)) + exitWithErrorForTraceLine(TraceIt); + + if (UseOffset) { + if (UseLoadableSegmentAsBase) { + Source += Binary->getFirstLoadableAddress(); + Target += Binary->getFirstLoadableAddress(); + } else { + Source += Binary->getPreferredBaseAddress(); + Target += Binary->getPreferredBaseAddress(); + } + } + + Counter[{Source, Target}] += Count; + TraceIt.advance(); + } + }; + + ReadCounter(SCounters.RangeCounter, "-"); + ReadCounter(SCounters.BranchCounter, "->"); +} + +void UnsymbolizedProfileReader::readUnsymbolizedProfile(StringRef FileName) { + TraceStream TraceIt(FileName); + while (!TraceIt.isAtEoF()) { + std::shared_ptr Key = + std::make_shared(); + StringRef Line = TraceIt.getCurrentLine(); + // Read context stack for CS profile. + if (Line.starts_with("[")) { + ProfileIsCS = true; + auto I = ContextStrSet.insert(Line.str()); + SampleContext::createCtxVectorFromStr(*I.first, Key->Context); + TraceIt.advance(); + } + auto Ret = + SampleCounters.emplace(Hashable(Key), SampleCounter()); + readSampleCounters(TraceIt, Ret.first->second); + } +} + +void UnsymbolizedProfileReader::parsePerfTraces() { + readUnsymbolizedProfile(PerfTraceFile); +} + +void PerfScriptReader::computeCounterFromLBR(const PerfSample *Sample, + uint64_t Repeat) { + SampleCounter &Counter = SampleCounters.begin()->second; + uint64_t EndAddress = 0; + for (const LBREntry &LBR : Sample->LBRStack) { + uint64_t SourceAddress = LBR.Source; + uint64_t TargetAddress = LBR.Target; + + // Record the branch if its SourceAddress is external. It can be the case an + // external source call an internal function, later this branch will be used + // to generate the function's head sample. + if (Binary->addressIsCode(TargetAddress)) { + Counter.recordBranchCount(SourceAddress, TargetAddress, Repeat); + } + + // If this not the first LBR, update the range count between TO of current + // LBR and FROM of next LBR. 
+ uint64_t StartAddress = TargetAddress; + if (Binary->addressIsCode(StartAddress) && + Binary->addressIsCode(EndAddress) && + isValidFallThroughRange(StartAddress, EndAddress, Binary)) + Counter.recordRangeCount(StartAddress, EndAddress, Repeat); + EndAddress = SourceAddress; + } +} + +void LBRPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { + std::shared_ptr Sample = std::make_shared(); + // Parsing LBR stack and populate into PerfSample.LBRStack + if (extractLBRStack(TraceIt, Sample->LBRStack)) { + warnIfMissingMMap(); + // Record LBR only samples by aggregation + AggregatedSamples[Hashable(Sample)] += Count; + } +} + +void PerfScriptReader::generateUnsymbolizedProfile() { + // There is no context for LBR only sample, so initialize one entry with + // fake "empty" context key. + assert(SampleCounters.empty() && + "Sample counter map should be empty before raw profile generation"); + std::shared_ptr Key = + std::make_shared(); + SampleCounters.emplace(Hashable(Key), SampleCounter()); + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + computeCounterFromLBR(Sample, Item.second); + } +} + +uint64_t PerfScriptReader::parseAggregatedCount(TraceStream &TraceIt) { + // The aggregated count is optional, so do not skip the line and return 1 if + // it's unmatched + uint64_t Count = 1; + if (!TraceIt.getCurrentLine().getAsInteger(10, Count)) + TraceIt.advance(); + return Count; +} + +void PerfScriptReader::parseSample(TraceStream &TraceIt) { + NumTotalSample++; + uint64_t Count = parseAggregatedCount(TraceIt); + assert(Count >= 1 && "Aggregated count should be >= 1!"); + parseSample(TraceIt, Count); +} + +bool PerfScriptReader::extractMMapEventForBinary(ProfiledBinary *Binary, + StringRef Line, + MMapEvent &MMap) { + // Parse a MMap2 line like: + // PERF_RECORD_MMAP2 2113428/2113428: [0x7fd4efb57000(0x204000) @ 0 + // 08:04 19532229 3585508847]: r-xp /usr/lib64/libdl-2.17.so + constexpr static const char *const MMap2Pattern = + "PERF_RECORD_MMAP2 (-?[0-9]+)/[0-9]+: " + "\\[(0x[a-f0-9]+)\\((0x[a-f0-9]+)\\) @ " + "(0x[a-f0-9]+|0) .*\\]: [-a-z]+ (.*)"; + // Parse a MMap line like + // PERF_RECORD_MMAP -1/0: [0xffffffff81e00000(0x3e8fa000) @ \ + // 0xffffffff81e00000]: x [kernel.kallsyms]_text + constexpr static const char *const MMapPattern = + "PERF_RECORD_MMAP (-?[0-9]+)/[0-9]+: " + "\\[(0x[a-f0-9]+)\\((0x[a-f0-9]+)\\) @ " + "(0x[a-f0-9]+|0)\\]: [-a-z]+ (.*)"; + // Field 0 - whole line + // Field 1 - PID + // Field 2 - base address + // Field 3 - mmapped size + // Field 4 - page offset + // Field 5 - binary path + enum EventIndex { + WHOLE_LINE = 0, + PID = 1, + MMAPPED_ADDRESS = 2, + MMAPPED_SIZE = 3, + PAGE_OFFSET = 4, + BINARY_PATH = 5 + }; + + bool R = false; + SmallVector Fields; + if (Line.contains("PERF_RECORD_MMAP2 ")) { + Regex RegMmap2(MMap2Pattern); + R = RegMmap2.match(Line, &Fields); + } else if (Line.contains("PERF_RECORD_MMAP ")) { + Regex RegMmap(MMapPattern); + R = RegMmap.match(Line, &Fields); + } else + llvm_unreachable("unexpected MMAP event entry"); + + if (!R) { + std::string WarningMsg = "Cannot parse mmap event: " + Line.str() + " \n"; + WithColor::warning() << WarningMsg; + return false; + } + long long MMapPID = 0; + getAsSignedInteger(Fields[PID], 10, MMapPID); + MMap.PID = MMapPID; + Fields[MMAPPED_ADDRESS].getAsInteger(0, MMap.Address); + Fields[MMAPPED_SIZE].getAsInteger(0, MMap.Size); + Fields[PAGE_OFFSET].getAsInteger(0, MMap.Offset); + MMap.BinaryPath = Fields[BINARY_PATH]; + if (ShowMmapEvents) { + 
outs() << "Mmap: Binary " << MMap.BinaryPath << " loaded at " + << format("0x%" PRIx64 ":", MMap.Address) << " \n"; + } + + StringRef BinaryName = filename(MMap.BinaryPath, Binary->isCOFF()); + if (Binary->isKernel()) { + return Binary->isKernelImageName(BinaryName); + } + return Binary->getName() == BinaryName; +} + +void PerfScriptReader::parseMMapEvent(TraceStream &TraceIt) { + MMapEvent MMap; + if (extractMMapEventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) + updateBinaryAddress(MMap); + TraceIt.advance(); +} + +void PerfScriptReader::parseEventOrSample(TraceStream &TraceIt) { + if (isMMapEvent(TraceIt.getCurrentLine())) + parseMMapEvent(TraceIt); + else + parseSample(TraceIt); +} + +void PerfScriptReader::parseAndAggregateTrace() { + // Trace line iterator + TraceStream TraceIt(PerfTraceFile); + while (!TraceIt.isAtEoF()) + parseEventOrSample(TraceIt); +} + +// A LBR sample is like: +// 40062f 0x5c6313f/0x5c63170/P/-/-/0 0x5c630e7/0x5c63130/P/-/-/0 ... +// A heuristic for fast detection by checking whether a +// leading " 0x" and the '/' exist. +bool PerfScriptReader::isLBRSample(StringRef Line) { + // Skip the leading instruction pointer + SmallVector Records; + Line.trim().split(Records, " ", 2, false); + if (Records.size() < 2) + return false; + if (Records[1].starts_with("0x") && Records[1].contains('/')) + return true; + return false; +} + +bool PerfScriptReader::isMMapEvent(StringRef Line) { + // Short cut to avoid string find is possible. + if (Line.empty() || Line.size() < 50) + return false; + + if (std::isdigit(Line[0])) + return false; + + // PERF_RECORD_MMAP2 or PERF_RECORD_MMAP does not appear at the beginning of + // the line for ` perf script --show-mmap-events -i ...` + return Line.contains("PERF_RECORD_MMAP"); +} + +// The raw hybird sample is like +// e.g. +// 4005dc # call stack leaf +// 400634 +// 400684 # call stack root +// 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... +// ... 
 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries
+// Determine whether the perfscript contains hybrid samples (call stack +
+// LBRs) by checking whether there is a non-empty call stack immediately
+// followed by an LBR sample.
+PerfContent PerfScriptReader::checkPerfScriptType(StringRef FileName) {
+ TraceStream TraceIt(FileName);
+ uint64_t FrameAddr = 0;
+ while (!TraceIt.isAtEoF()) {
+ // Skip the aggregated count.
+ if (!TraceIt.getCurrentLine().getAsInteger(10, FrameAddr))
+ TraceIt.advance();
+
+ // Detect a sample with a call stack.
+ int32_t Count = 0;
+ while (!TraceIt.isAtEoF() &&
+ !TraceIt.getCurrentLine().ltrim().getAsInteger(16, FrameAddr)) {
+ Count++;
+ TraceIt.advance();
+ }
+ if (!TraceIt.isAtEoF()) {
+ if (isLBRSample(TraceIt.getCurrentLine())) {
+ if (Count > 0)
+ return PerfContent::LBRStack;
+ else
+ return PerfContent::LBR;
+ }
+ TraceIt.advance();
+ }
+ }
+
+ exitWithError("Invalid perf script input!");
+ return PerfContent::UnknownContent;
+}
+
+void HybridPerfReader::generateUnsymbolizedProfile() {
+ ProfileIsCS = !IgnoreStackSamples;
+ if (ProfileIsCS)
+ unwindSamples();
+ else
+ PerfScriptReader::generateUnsymbolizedProfile();
+}
+
+void PerfScriptReader::warnTruncatedStack() {
+ if (ShowDetailedWarning) {
+ for (auto Address : InvalidReturnAddresses) {
+ WithColor::warning()
+ << "Truncated stack sample due to invalid return address at "
+ << format("0x%" PRIx64, Address)
+ << ", likely caused by frame pointer omission\n";
+ }
+ }
+ emitWarningSummary(
+ InvalidReturnAddresses.size(), AggregatedSamples.size(),
+ "of truncated stack samples due to invalid return address, "
+ "likely caused by frame pointer omission.");
+}
+
+void PerfScriptReader::warnInvalidRange() {
+ std::unordered_map<std::pair<uint64_t, uint64_t>, uint64_t,
+ pair_hash<uint64_t, uint64_t>>
+ Ranges;
+
+ for (const auto &Item : AggregatedSamples) {
+ const PerfSample *Sample = Item.first.getPtr();
+ uint64_t Count = Item.second;
+ uint64_t EndAddress = 0;
+ for (const LBREntry &LBR : Sample->LBRStack) {
+ uint64_t SourceAddress = LBR.Source;
+ uint64_t StartAddress = LBR.Target;
+ if (EndAddress != 0)
+ Ranges[{StartAddress, EndAddress}] += Count;
+ EndAddress = SourceAddress;
+ }
+ }
+
+ if (Ranges.empty()) {
+ WithColor::warning() << "No samples in perf script!\n";
+ return;
+ }
+
+ auto WarnInvalidRange = [&](uint64_t StartAddress, uint64_t EndAddress,
+ StringRef Msg) {
+ if (!ShowDetailedWarning)
+ return;
+ WithColor::warning() << "[" << format("%8" PRIx64, StartAddress) << ","
+ << format("%8" PRIx64, EndAddress) << "]: " << Msg
+ << "\n";
+ };
+
+ const char *EndNotBoundaryMsg = "Range is not on instruction boundary, "
+ "likely due to profile and binary mismatch.";
+ const char *DanglingRangeMsg = "Range does not belong to any functions, "
+ "likely from PLT, .init or .fini section.";
+ const char *RangeCrossFuncMsg =
+ "Fall through range should not cross function boundaries, likely due to "
+ "profile and binary mismatch.";
+ const char *BogusRangeMsg = "Range start is after or too far from range end.";
+
+ uint64_t TotalRangeNum = 0;
+ uint64_t InstNotBoundary = 0;
+ uint64_t UnmatchedRange = 0;
+ uint64_t RangeCrossFunc = 0;
+ uint64_t BogusRange = 0;
+
+ for (auto &I : Ranges) {
+ uint64_t StartAddress = I.first.first;
+ uint64_t EndAddress = I.first.second;
+ TotalRangeNum += I.second;
+
+ if (!Binary->addressIsCode(StartAddress) &&
+ !Binary->addressIsCode(EndAddress))
+ continue;
+
+ if (!Binary->addressIsCode(StartAddress) ||
+ !Binary->addressIsTransfer(EndAddress)) {
+ InstNotBoundary += I.second;
+ WarnInvalidRange(StartAddress, EndAddress,
EndNotBoundaryMsg);
+ }
+
+ auto *FRange = Binary->findFuncRange(StartAddress);
+ if (!FRange) {
+ UnmatchedRange += I.second;
+ WarnInvalidRange(StartAddress, EndAddress, DanglingRangeMsg);
+ continue;
+ }
+
+ if (EndAddress >= FRange->EndAddress) {
+ RangeCrossFunc += I.second;
+ WarnInvalidRange(StartAddress, EndAddress, RangeCrossFuncMsg);
+ }
+
+ if (Binary->addressIsCode(StartAddress) &&
+ Binary->addressIsCode(EndAddress) &&
+ !isValidFallThroughRange(StartAddress, EndAddress, Binary)) {
+ BogusRange += I.second;
+ WarnInvalidRange(StartAddress, EndAddress, BogusRangeMsg);
+ }
+ }
+
+ emitWarningSummary(
+ InstNotBoundary, TotalRangeNum,
+ "of samples are from ranges that are not on instruction boundary.");
+ emitWarningSummary(
+ UnmatchedRange, TotalRangeNum,
+ "of samples are from ranges that do not belong to any functions.");
+ emitWarningSummary(
+ RangeCrossFunc, TotalRangeNum,
+ "of samples are from ranges that cross function boundaries.");
+ emitWarningSummary(
+ BogusRange, TotalRangeNum,
+ "of samples are from ranges whose start is after or too far from the "
+ "range end, crossing an unconditional jmp.");
+}
+
+void PerfScriptReader::parsePerfTraces() {
+ // Parse perf traces and do aggregation.
+ parseAndAggregateTrace();
+ if (Binary->isKernel() && !Binary->getIsLoadedByMMap()) {
+ exitWithError(
+ "Kernel is requested, but no kernel is found in mmap events.");
+ }
+
+ emitWarningSummary(NumLeafExternalFrame, NumTotalSample,
+ "of samples have leaf external frame in call stack.");
+ emitWarningSummary(NumLeadingOutgoingLBR, NumTotalSample,
+ "of samples have leading external LBR.");
+
+ // Generate the unsymbolized profile.
+ warnTruncatedStack();
+ warnInvalidRange();
+ generateUnsymbolizedProfile();
+ AggregatedSamples.clear();
+
+ if (SkipSymbolization)
+ writeUnsymbolizedProfile(OutputFilename);
+}
+
+SmallVector<CleanupInstaller> PerfScriptReader::TempFileCleanups;
+
+} // end namespace sampleprof
+} // end namespace llvm
diff --git a/tools/ldc-profgen/ldc-profgen-21.1/PerfReader.h b/tools/ldc-profgen/ldc-profgen-21.1/PerfReader.h
new file mode 100644
index 0000000000..4b3ac8f569
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-21.1/PerfReader.h
@@ -0,0 +1,746 @@
+//===-- PerfReader.h - perfscript reader -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H +#define LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H +#include "ErrorHandling.h" +#include "ProfiledBinary.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Regex.h" +#include +#include +#include + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { + +class CleanupInstaller; + +namespace sampleprof { + +// Stream based trace line iterator +class TraceStream { + std::string CurrentLine; + std::ifstream Fin; + bool IsAtEoF = false; + uint64_t LineNumber = 0; + +public: + TraceStream(StringRef Filename) : Fin(Filename.str()) { + if (!Fin.good()) + exitWithError("Error read input perf script file", Filename); + advance(); + } + + StringRef getCurrentLine() { + assert(!IsAtEoF && "Line iterator reaches the End-of-File!"); + return CurrentLine; + } + + uint64_t getLineNumber() { return LineNumber; } + + bool isAtEoF() { return IsAtEoF; } + + // Read the next line + void advance() { + if (!std::getline(Fin, CurrentLine)) { + IsAtEoF = true; + return; + } + LineNumber++; + } +}; + +// The type of input format. +enum PerfFormat { + UnknownFormat = 0, + PerfData = 1, // Raw linux perf.data. + PerfScript = 2, // Perf script create by `perf script` command. + UnsymbolizedProfile = 3, // Unsymbolized profile generated by llvm-profgen. + +}; + +// The type of perfscript content. +enum PerfContent { + UnknownContent = 0, + LBR = 1, // Only LBR sample. + LBRStack = 2, // Hybrid sample including call stack and LBR stack. +}; + +struct PerfInputFile { + std::string InputFile; + PerfFormat Format = PerfFormat::UnknownFormat; + PerfContent Content = PerfContent::UnknownContent; +}; + +// The parsed LBR sample entry. +struct LBREntry { + uint64_t Source = 0; + uint64_t Target = 0; + LBREntry(uint64_t S, uint64_t T) : Source(S), Target(T) {} + +#ifndef NDEBUG + void print() const { + dbgs() << "from " << format("%#010x", Source) << " to " + << format("%#010x", Target); + } +#endif +}; + +#ifndef NDEBUG +static inline void printLBRStack(const SmallVectorImpl &LBRStack) { + for (size_t I = 0; I < LBRStack.size(); I++) { + dbgs() << "[" << I << "] "; + LBRStack[I].print(); + dbgs() << "\n"; + } +} + +static inline void printCallStack(const SmallVectorImpl &CallStack) { + for (size_t I = 0; I < CallStack.size(); I++) { + dbgs() << "[" << I << "] " << format("%#010x", CallStack[I]) << "\n"; + } +} +#endif + +// Hash interface for generic data of type T +// Data should implement a \fn getHashCode and a \fn isEqual +// Currently getHashCode is non-virtual to avoid the overhead of calling vtable, +// i.e we explicitly calculate hash of derived class, assign to base class's +// HashCode. This also provides the flexibility for calculating the hash code +// incrementally(like rolling hash) during frame stack unwinding since unwinding +// only changes the leaf of frame stack. \fn isEqual is a virtual function, +// which will have perf overhead. 
In the future, if we redesign a better hash +// function, then we can just skip this or switch to non-virtual function(like +// just ignore comparison if hash conflicts probabilities is low) +template class Hashable { +public: + std::shared_ptr Data; + Hashable(const std::shared_ptr &D) : Data(D) {} + + // Hash code generation + struct Hash { + uint64_t operator()(const Hashable &Key) const { + // Don't make it virtual for getHashCode + uint64_t Hash = Key.Data->getHashCode(); + assert(Hash && "Should generate HashCode for it!"); + return Hash; + } + }; + + // Hash equal + struct Equal { + bool operator()(const Hashable &LHS, const Hashable &RHS) const { + // Precisely compare the data, vtable will have overhead. + return LHS.Data->isEqual(RHS.Data.get()); + } + }; + + T *getPtr() const { return Data.get(); } +}; + +struct PerfSample { + // LBR stack recorded in FIFO order. + SmallVector LBRStack; + // Call stack recorded in FILO(leaf to root) order, it's used for CS-profile + // generation + SmallVector CallStack; + + virtual ~PerfSample() = default; + uint64_t getHashCode() const { + // Use simple DJB2 hash + auto HashCombine = [](uint64_t H, uint64_t V) { + return ((H << 5) + H) + V; + }; + uint64_t Hash = 5381; + for (const auto &Value : CallStack) { + Hash = HashCombine(Hash, Value); + } + for (const auto &Entry : LBRStack) { + Hash = HashCombine(Hash, Entry.Source); + Hash = HashCombine(Hash, Entry.Target); + } + return Hash; + } + + bool isEqual(const PerfSample *Other) const { + const SmallVector &OtherCallStack = Other->CallStack; + const SmallVector &OtherLBRStack = Other->LBRStack; + + if (CallStack.size() != OtherCallStack.size() || + LBRStack.size() != OtherLBRStack.size()) + return false; + + if (!std::equal(CallStack.begin(), CallStack.end(), OtherCallStack.begin())) + return false; + + for (size_t I = 0; I < OtherLBRStack.size(); I++) { + if (LBRStack[I].Source != OtherLBRStack[I].Source || + LBRStack[I].Target != OtherLBRStack[I].Target) + return false; + } + return true; + } + +#ifndef NDEBUG + uint64_t Linenum = 0; + + void print() const { + dbgs() << "Line " << Linenum << "\n"; + dbgs() << "LBR stack\n"; + printLBRStack(LBRStack); + dbgs() << "Call stack\n"; + printCallStack(CallStack); + } +#endif +}; +// After parsing the sample, we record the samples by aggregating them +// into this counter. The key stores the sample data and the value is +// the sample repeat times. +using AggregatedCounter = + std::unordered_map, uint64_t, + Hashable::Hash, Hashable::Equal>; + +using SampleVector = SmallVector, 16>; + +inline bool isValidFallThroughRange(uint64_t Start, uint64_t End, + ProfiledBinary *Binary) { + // Start bigger than End is considered invalid. + // LBR ranges cross the unconditional jmp are also assumed invalid. + // It's found that perf data may contain duplicate LBR entries that could form + // a range that does not reflect real execution flow on some Intel targets, + // e.g. Skylake. Such ranges are ususally very long. Exclude them since there + // cannot be a linear execution range that spans over unconditional jmp. 
+ return Start <= End && !Binary->rangeCrossUncondBranch(Start, End); +} + +// The state for the unwinder, it doesn't hold the data but only keep the +// pointer/index of the data, While unwinding, the CallStack is changed +// dynamicially and will be recorded as the context of the sample +struct UnwindState { + // Profiled binary that current frame address belongs to + const ProfiledBinary *Binary; + // Call stack trie node + struct ProfiledFrame { + const uint64_t Address = DummyRoot; + ProfiledFrame *Parent; + SampleVector RangeSamples; + SampleVector BranchSamples; + std::unordered_map> Children; + + ProfiledFrame(uint64_t Addr = 0, ProfiledFrame *P = nullptr) + : Address(Addr), Parent(P) {} + ProfiledFrame *getOrCreateChildFrame(uint64_t Address) { + assert(Address && "Address can't be zero!"); + auto Ret = Children.emplace( + Address, std::make_unique(Address, this)); + return Ret.first->second.get(); + } + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Count) { + RangeSamples.emplace_back(std::make_tuple(Start, End, Count)); + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) { + BranchSamples.emplace_back(std::make_tuple(Source, Target, Count)); + } + bool isDummyRoot() { return Address == DummyRoot; } + bool isExternalFrame() { return Address == ExternalAddr; } + bool isLeafFrame() { return Children.empty(); } + }; + + ProfiledFrame DummyTrieRoot; + ProfiledFrame *CurrentLeafFrame; + // Used to fall through the LBR stack + uint32_t LBRIndex = 0; + // Reference to PerfSample.LBRStack + const SmallVector &LBRStack; + // Used to iterate the address range + InstructionPointer InstPtr; + // Indicate whether unwinding is currently in a bad state which requires to + // skip all subsequent unwinding. + bool Invalid = false; + UnwindState(const PerfSample *Sample, const ProfiledBinary *Binary) + : Binary(Binary), LBRStack(Sample->LBRStack), + InstPtr(Binary, Sample->CallStack.front()) { + initFrameTrie(Sample->CallStack); + } + + bool validateInitialState() { + uint64_t LBRLeaf = LBRStack[LBRIndex].Target; + uint64_t LeafAddr = CurrentLeafFrame->Address; + assert((LBRLeaf != ExternalAddr || LBRLeaf == LeafAddr) && + "External leading LBR should match the leaf frame."); + + // When we take a stack sample, ideally the sampling distance between the + // leaf IP of stack and the last LBR target shouldn't be very large. + // Use a heuristic size (0x100) to filter out broken records. 
+ if (LeafAddr < LBRLeaf || LeafAddr - LBRLeaf >= 0x100) { + WithColor::warning() << "Bogus trace: stack tip = " + << format("%#010x", LeafAddr) + << ", LBR tip = " << format("%#010x\n", LBRLeaf); + return false; + } + return true; + } + + void checkStateConsistency() { + assert(InstPtr.Address == CurrentLeafFrame->Address && + "IP should align with context leaf"); + } + + void setInvalid() { Invalid = true; } + bool hasNextLBR() const { return LBRIndex < LBRStack.size(); } + uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; } + uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; } + const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; } + bool IsLastLBR() const { return LBRIndex == 0; } + bool getLBRStackSize() const { return LBRStack.size(); } + void advanceLBR() { LBRIndex++; } + ProfiledFrame *getParentFrame() { return CurrentLeafFrame->Parent; } + + void pushFrame(uint64_t Address) { + CurrentLeafFrame = CurrentLeafFrame->getOrCreateChildFrame(Address); + } + + void switchToFrame(uint64_t Address) { + if (CurrentLeafFrame->Address == Address) + return; + CurrentLeafFrame = CurrentLeafFrame->Parent->getOrCreateChildFrame(Address); + } + + void popFrame() { CurrentLeafFrame = CurrentLeafFrame->Parent; } + + void clearCallStack() { CurrentLeafFrame = &DummyTrieRoot; } + + void initFrameTrie(const SmallVectorImpl &CallStack) { + ProfiledFrame *Cur = &DummyTrieRoot; + for (auto Address : reverse(CallStack)) { + Cur = Cur->getOrCreateChildFrame(Address); + } + CurrentLeafFrame = Cur; + } + + ProfiledFrame *getDummyRootPtr() { return &DummyTrieRoot; } +}; + +// Base class for sample counter key with context +struct ContextKey { + uint64_t HashCode = 0; + virtual ~ContextKey() = default; + uint64_t getHashCode() { + if (HashCode == 0) + genHashCode(); + return HashCode; + } + virtual void genHashCode() = 0; + virtual bool isEqual(const ContextKey *K) const { + return HashCode == K->HashCode; + }; + + // Utilities for LLVM-style RTTI + enum ContextKind { CK_StringBased, CK_AddrBased }; + const ContextKind Kind; + ContextKind getKind() const { return Kind; } + ContextKey(ContextKind K) : Kind(K){}; +}; + +// String based context id +struct StringBasedCtxKey : public ContextKey { + SampleContextFrameVector Context; + + bool WasLeafInlined; + StringBasedCtxKey() : ContextKey(CK_StringBased), WasLeafInlined(false){}; + static bool classof(const ContextKey *K) { + return K->getKind() == CK_StringBased; + } + + bool isEqual(const ContextKey *K) const override { + const StringBasedCtxKey *Other = dyn_cast(K); + return Context == Other->Context; + } + + void genHashCode() override { + HashCode = hash_value(SampleContextFrames(Context)); + } +}; + +// Address-based context id +struct AddrBasedCtxKey : public ContextKey { + SmallVector Context; + + bool WasLeafInlined; + AddrBasedCtxKey() : ContextKey(CK_AddrBased), WasLeafInlined(false){}; + static bool classof(const ContextKey *K) { + return K->getKind() == CK_AddrBased; + } + + bool isEqual(const ContextKey *K) const override { + const AddrBasedCtxKey *Other = dyn_cast(K); + return Context == Other->Context; + } + + void genHashCode() override { HashCode = hash_combine_range(Context); } +}; + +// The counter of branch samples for one function indexed by the branch, +// which is represented as the source and target offset pair. 
+using BranchSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>;
+// The counter of range samples for one function indexed by the range,
+// which is represented as the start and end offset pair.
+using RangeSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>;
+// Wrapper for sample counters including range counter and branch counter
+struct SampleCounter {
+ RangeSample RangeCounter;
+ BranchSample BranchCounter;
+
+ void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Repeat) {
+ assert(Start <= End && "Invalid instruction range");
+ RangeCounter[{Start, End}] += Repeat;
+ }
+ void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Repeat) {
+ BranchCounter[{Source, Target}] += Repeat;
+ }
+};
+
+// Sample counter with context to support context-sensitive profile
+using ContextSampleCounterMap =
+ std::unordered_map<Hashable<ContextKey>, SampleCounter,
+ Hashable<ContextKey>::Hash, Hashable<ContextKey>::Equal>;
+
+struct FrameStack {
+ SmallVector<uint64_t> Stack;
+ ProfiledBinary *Binary;
+ FrameStack(ProfiledBinary *B) : Binary(B) {}
+ bool pushFrame(UnwindState::ProfiledFrame *Cur) {
+ assert(!Cur->isExternalFrame() &&
+ "External frame's not expected for context stack.");
+ Stack.push_back(Cur->Address);
+ return true;
+ }
+
+ void popFrame() {
+ if (!Stack.empty())
+ Stack.pop_back();
+ }
+ std::shared_ptr<StringBasedCtxKey> getContextKey();
+};
+
+struct AddressStack {
+ SmallVector<uint64_t> Stack;
+ ProfiledBinary *Binary;
+ AddressStack(ProfiledBinary *B) : Binary(B) {}
+ bool pushFrame(UnwindState::ProfiledFrame *Cur) {
+ assert(!Cur->isExternalFrame() &&
+ "External frame's not expected for context stack.");
+ Stack.push_back(Cur->Address);
+ return true;
+ }
+
+ void popFrame() {
+ if (!Stack.empty())
+ Stack.pop_back();
+ }
+ std::shared_ptr<AddrBasedCtxKey> getContextKey();
+};
+
+/*
+As a hybrid sample gives us a group of LBRs plus the most recent sampled call
+stack, we can walk through those LBRs to infer more call stacks, which are
+used as contexts for the profile. VirtualUnwinder is the class that does the
+call stack unwinding based on LBR state. Two types of unwinding are processed
+here: 1) LBR unwinding and 2) linear range unwinding.
+Specifically, for each LBR entry (which can be classified as a call, return,
+or regular branch), LBR unwinding replays the operation by pushing, popping
+or switching the leaf frame of the call stack. Since the initial call stack
+is the most recently sampled one, the replay happens in anti-execution order,
+i.e. for the regular case, pop the call stack when the LBR is a call, and
+push a frame onto the call stack when the LBR is a return. After each LBR is
+processed, the unwinder also needs to align with the next LBR by going
+through the instructions from the previous LBR's target to the current LBR's
+source; this is the linear unwinding. As instructions in a linear range can
+come from different functions due to inlining, linear unwinding splits the
+range and records counters per sub-range with the same inline context. Over
+this unwinding process we record each call stack as a context id and each
+LBR/linear range as a sample counter for further CS profile generation.
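+
+An illustrative walk-through (hypothetical frames): suppose the sampled stack
+is main -> foo (leaf foo) and the newest LBR entry is a return from bar back
+into foo. Replaying that return in anti-execution order pushes bar, so the
+ranges executed in bar before the return are attributed to the context
+main -> foo -> bar; a later call entry pops the corresponding frame again.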
+*/ +class VirtualUnwinder { +public: + VirtualUnwinder(ContextSampleCounterMap *Counter, ProfiledBinary *B) + : CtxCounterMap(Counter), Binary(B) {} + bool unwind(const PerfSample *Sample, uint64_t Repeat); + std::set &getUntrackedCallsites() { return UntrackedCallsites; } + + uint64_t NumTotalBranches = 0; + uint64_t NumExtCallBranch = 0; + uint64_t NumMissingExternalFrame = 0; + uint64_t NumMismatchedProEpiBranch = 0; + uint64_t NumMismatchedExtCallBranch = 0; + uint64_t NumUnpairedExtAddr = 0; + uint64_t NumPairedExtAddr = 0; + +private: + bool isSourceExternal(UnwindState &State) const { + return State.getCurrentLBRSource() == ExternalAddr; + } + + bool isTargetExternal(UnwindState &State) const { + return State.getCurrentLBRTarget() == ExternalAddr; + } + + // Determine whether the return source is from external code by checking if + // the target's the next inst is a call inst. + bool isReturnFromExternal(UnwindState &State) const { + return isSourceExternal(State) && + (Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()) != 0); + } + + // If the source is external address but it's not the `return` case, treat it + // as a call from external. + bool isCallFromExternal(UnwindState &State) const { + return isSourceExternal(State) && + Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()) == 0; + } + + bool isCallState(UnwindState &State) const { + // The tail call frame is always missing here in stack sample, we will + // use a specific tail call tracker to infer it. + if (!isValidState(State)) + return false; + + if (Binary->addressIsCall(State.getCurrentLBRSource())) + return true; + + return isCallFromExternal(State); + } + + bool isReturnState(UnwindState &State) const { + if (!isValidState(State)) + return false; + + // Simply check addressIsReturn, as ret is always reliable, both for + // regular call and tail call. + if (Binary->addressIsReturn(State.getCurrentLBRSource())) + return true; + + return isReturnFromExternal(State); + } + + bool isValidState(UnwindState &State) const { return !State.Invalid; } + + void unwindCall(UnwindState &State); + void unwindLinear(UnwindState &State, uint64_t Repeat); + void unwindReturn(UnwindState &State); + void unwindBranch(UnwindState &State); + + template + void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack); + // Collect each samples on trie node by DFS traversal + template + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur, T &Stack); + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur); + + void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State, + uint64_t Repeat); + void recordBranchCount(const LBREntry &Branch, UnwindState &State, + uint64_t Repeat); + + ContextSampleCounterMap *CtxCounterMap; + // Profiled binary that current frame address belongs to + ProfiledBinary *Binary; + // Keep track of all untracked callsites + std::set UntrackedCallsites; +}; + +// Read perf trace to parse the events and samples. +class PerfReaderBase { +public: + PerfReaderBase(ProfiledBinary *B, StringRef PerfTrace) + : Binary(B), PerfTraceFile(PerfTrace) { + // Initialize the base address to preferred address. 
+ Binary->setBaseAddress(Binary->getPreferredBaseAddress());
+ };
+ virtual ~PerfReaderBase() = default;
+ static std::unique_ptr<PerfReaderBase>
+ create(ProfiledBinary *Binary, PerfInputFile &PerfInput,
+ std::optional<int32_t> PIDFilter);
+
+ // Entry point of the reader to parse multiple perf traces
+ virtual void parsePerfTraces() = 0;
+ const ContextSampleCounterMap &getSampleCounters() const {
+ return SampleCounters;
+ }
+ bool profileIsCS() { return ProfileIsCS; }
+
+protected:
+ ProfiledBinary *Binary = nullptr;
+ StringRef PerfTraceFile;
+
+ ContextSampleCounterMap SampleCounters;
+ bool ProfileIsCS = false;
+
+ uint64_t NumTotalSample = 0;
+ uint64_t NumLeafExternalFrame = 0;
+ uint64_t NumLeadingOutgoingLBR = 0;
+};
+
+// Read perf script to parse the events and samples.
+class PerfScriptReader : public PerfReaderBase {
+public:
+ PerfScriptReader(ProfiledBinary *B, StringRef PerfTrace,
+ std::optional<int32_t> PID)
+ : PerfReaderBase(B, PerfTrace), PIDFilter(PID) {};
+
+ // Entry point of the reader to parse multiple perf traces
+ void parsePerfTraces() override;
+ // Generate a perf script from perf data
+ static PerfInputFile convertPerfDataToTrace(ProfiledBinary *Binary,
+ bool SkipPID, PerfInputFile &File,
+ std::optional<int32_t> PIDFilter);
+ // Extract the perf script type by peeking at the input
+ static PerfContent checkPerfScriptType(StringRef FileName);
+
+ // Cleanup installers for temporary files created by the perf script
+ // command. Those files will be automatically removed when running the
+ // destructor or receiving signals.
+ static SmallVector<CleanupInstaller> TempFileCleanups;
+
+protected:
+ // The parsed MMap event
+ struct MMapEvent {
+ int64_t PID = 0;
+ uint64_t Address = 0;
+ uint64_t Size = 0;
+ uint64_t Offset = 0;
+ StringRef BinaryPath;
+ };
+
+ // Check whether a given line is an LBR sample
+ static bool isLBRSample(StringRef Line);
+ // Check whether a given line is an MMAP event
+ static bool isMMapEvent(StringRef Line);
+ // Parse a single line of a PERF_RECORD_MMAP event looking for a
+ // mapping between the binary name and its memory layout.
+ static bool extractMMapEventForBinary(ProfiledBinary *Binary, StringRef Line,
+ MMapEvent &MMap);
+ // Update the base address based on mmap events
+ void updateBinaryAddress(const MMapEvent &Event);
+ // Parse an mmap event and update the binary address
+ void parseMMapEvent(TraceStream &TraceIt);
+ // Parse perf events/samples and do aggregation
+ void parseAndAggregateTrace();
+ // Parse either an MMAP event or a perf sample
+ void parseEventOrSample(TraceStream &TraceIt);
+ // Warn if the relevant mmap event is missing.
+ void warnIfMissingMMap();
+ // Emit accumulated warnings.
+ void warnTruncatedStack();
+ // Warn if a range is invalid.
+ void warnInvalidRange();
+ // Extract the call stack from the perf trace lines
+ bool extractCallstack(TraceStream &TraceIt,
+ SmallVectorImpl<uint64_t> &CallStack);
+ // Extract the LBR stack from one perf trace line
+ bool extractLBRStack(TraceStream &TraceIt,
+ SmallVectorImpl<LBREntry> &LBRStack);
+ uint64_t parseAggregatedCount(TraceStream &TraceIt);
+ // Parse one sample from multiple perf lines; override this for different
+ // sample types
+ void parseSample(TraceStream &TraceIt);
+ // An aggregated count is given to indicate how many times the sample is
+ // repeated.
+ virtual void parseSample(TraceStream &TraceIt, uint64_t Count){};
+ void computeCounterFromLBR(const PerfSample *Sample, uint64_t Repeat);
+ // Post-process the profile after trace aggregation: we do a simple range
+ // overlap computation for AutoFDO, or unwinding for CSSPGO (hybrid
+ // samples).
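+ // For example, an LBR-only (AutoFDO-style) input is handled by the base
+ // implementation via computeCounterFromLBR(), while HybridPerfReader
+ // overrides this to run the VirtualUnwinder on call-stack + LBR samples.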
+ virtual void generateUnsymbolizedProfile();
+ void writeUnsymbolizedProfile(StringRef Filename);
+ void writeUnsymbolizedProfile(raw_fd_ostream &OS);
+
+ // Samples with their repeat counts, generated by the perf reader
+ AggregatedCounter AggregatedSamples;
+ // Keep track of all invalid return addresses
+ std::set<uint64_t> InvalidReturnAddresses;
+ // PID of the process of interest
+ std::optional<int32_t> PIDFilter;
+};
+
+/*
+ The reader for LBR-only perf scripts.
+ A typical LBR sample looks like:
+ 40062f 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
+ ... 0x4005c8/0x4005dc/P/-/-/0
+*/
+class LBRPerfReader : public PerfScriptReader {
+public:
+ LBRPerfReader(ProfiledBinary *Binary, StringRef PerfTrace,
+ std::optional<int32_t> PID)
+ : PerfScriptReader(Binary, PerfTrace, PID) {};
+ // Parse the LBR-only sample.
+ void parseSample(TraceStream &TraceIt, uint64_t Count) override;
+};
+
+/*
+ A hybrid perf script includes a group of hybrid samples (LBRs + call stack),
+ which are used to generate a CS profile. An example of a hybrid sample:
+ 4005dc # call stack leaf
+ 400634
+ 400684 # call stack root
+ 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
+ ... 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries
+*/
+class HybridPerfReader : public PerfScriptReader {
+public:
+ HybridPerfReader(ProfiledBinary *Binary, StringRef PerfTrace,
+ std::optional<int32_t> PID)
+ : PerfScriptReader(Binary, PerfTrace, PID) {};
+ // Parse the hybrid sample including the call stack and LBR lines
+ void parseSample(TraceStream &TraceIt, uint64_t Count) override;
+ void generateUnsymbolizedProfile() override;
+
+private:
+ // Unwind the hybrid samples after aggregation
+ void unwindSamples();
+};
+
+/*
+ Format of the unsymbolized profile:
+
+ [frame1 @ frame2 @ ...] # If it's a CS profile
+ number of entries in RangeCounter
+ from_1-to_1:count_1
+ from_2-to_2:count_2
+ ......
+ from_n-to_n:count_n
+ number of entries in BranchCounter
+ src_1->dst_1:count_1
+ src_2->dst_2:count_2
+ ......
+ src_n->dst_n:count_n
+ [frame1 @ frame2 @ ...] # Next context
+ ......
+
+Note that a non-CS profile doesn't have the `[]` context lines.
+*/
+class UnsymbolizedProfileReader : public PerfReaderBase {
+public:
+ UnsymbolizedProfileReader(ProfiledBinary *Binary, StringRef PerfTrace)
+ : PerfReaderBase(Binary, PerfTrace){};
+ void parsePerfTraces() override;
+
+private:
+ void readSampleCounters(TraceStream &TraceIt, SampleCounter &SCounters);
+ void readUnsymbolizedProfile(StringRef Filename);
+
+ std::unordered_set<std::string> ContextStrSet;
+};
+
+} // end namespace sampleprof
+} // end namespace llvm
+
+#endif
diff --git a/tools/ldc-profgen/ldc-profgen-21.1/ProfileGenerator.cpp b/tools/ldc-profgen/ldc-profgen-21.1/ProfileGenerator.cpp
new file mode 100644
index 0000000000..db686c3b59
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-21.1/ProfileGenerator.cpp
@@ -0,0 +1,1371 @@
+//===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "ProfileGenerator.h"
+#include "ErrorHandling.h"
+#include "MissingFrameInferrer.h"
+#include "PerfReader.h"
+#include "ProfiledBinary.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include <algorithm>
+#include <float.h>
+#include <unordered_set>
+#include <utility>
+
+cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                    cl::Required,
+                                    cl::desc("Output profile file"));
+static cl::alias OutputA("o", cl::desc("Alias for --output"),
+                         cl::aliasopt(OutputFilename));
+
+static cl::opt<SampleProfileFormat> OutputFormat(
+    "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary),
+    cl::values(
+        clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"),
+        clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"),
+        clEnumValN(SPF_Text, "text", "Text encoding"),
+        clEnumValN(SPF_GCC, "gcc",
+                   "GCC encoding (only meaningful for -sample)")));
+
+static cl::opt<bool> UseMD5(
+    "use-md5", cl::Hidden,
+    cl::desc("Use md5 to represent function names in the output profile (only "
+             "meaningful for -extbinary)"));
+
+static cl::opt<bool> PopulateProfileSymbolList(
+    "populate-profile-symbol-list", cl::init(false), cl::Hidden,
+    cl::desc("Populate profile symbol list (only meaningful for -extbinary)"));
+
+static cl::opt<bool> FillZeroForAllFuncs(
+    "fill-zero-for-all-funcs", cl::init(false), cl::Hidden,
+    cl::desc("Attribute all functions' range with zero count "
+             "even if it's not hit by any samples."));
+
+static cl::opt<int32_t, true> RecursionCompression(
+    "compress-recursion",
+    cl::desc("Compressing recursion by deduplicating adjacent frame "
+             "sequences up to the specified size. -1 means no size limit."),
+    cl::Hidden,
+    cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize));
+
+static cl::opt<bool>
+    TrimColdProfile("trim-cold-profile",
+                    cl::desc("If the total count of the profile is smaller "
+                             "than threshold, it will be trimmed."));
+
+static cl::opt<bool> CSProfMergeColdContext(
+    "csprof-merge-cold-context", cl::init(true),
+    cl::desc("If the total count of context profile is smaller than "
+             "the threshold, it will be merged into context-less base "
+             "profile."));
+
+static cl::opt<uint32_t> CSProfMaxColdContextDepth(
+    "csprof-max-cold-context-depth", cl::init(1),
+    cl::desc("Keep the last K contexts while merging cold profile. 1 means the "
+             "context-less base profile"));
+
+static cl::opt<int, true> CSProfMaxContextDepth(
+    "csprof-max-context-depth",
+    cl::desc("Keep the last K contexts while merging profile. 
-1 means no "
+             "depth limit."),
+    cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
+
+static cl::opt<int> ProfileDensityThreshold(
+    "profile-density-threshold", llvm::cl::init(50),
+    llvm::cl::desc("If the profile density is below the given threshold, it "
+                   "will be suggested to increase the sampling rate."),
+    llvm::cl::Optional);
+static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
+                                 llvm::cl::desc("show profile density details"),
+                                 llvm::cl::Optional);
+static cl::opt<uint64_t> ProfileDensityCutOffHot(
+    "profile-density-cutoff-hot", llvm::cl::init(990000),
+    llvm::cl::desc("Total samples cutoff for functions used to calculate "
+                   "profile density."));
+
+static cl::opt<bool> UpdateTotalSamples(
+    "update-total-samples", llvm::cl::init(false),
+    llvm::cl::desc(
+        "Update total samples by accumulating all its body samples."),
+    llvm::cl::Optional);
+
+static cl::opt<bool> GenCSNestedProfile(
+    "gen-cs-nested-profile", cl::Hidden, cl::init(true),
+    cl::desc("Generate nested function profiles for CSSPGO"));
+
+cl::opt<bool> InferMissingFrames(
+    "infer-missing-frames", llvm::cl::init(true),
+    llvm::cl::desc(
+        "Infer missing call frames due to compiler tail call elimination."),
+    llvm::cl::Optional);
+
+using namespace llvm;
+using namespace sampleprof;
+
+namespace llvm {
+
+namespace sampleprof {
+
+// Initialize the MaxCompressionSize to -1 which means no size limit
+int32_t CSProfileGenerator::MaxCompressionSize = -1;
+
+int CSProfileGenerator::MaxContextDepth = -1;
+
+bool ProfileGeneratorBase::UseFSDiscriminator = false;
+
+std::unique_ptr<ProfileGeneratorBase>
+ProfileGeneratorBase::create(ProfiledBinary *Binary,
+                             const ContextSampleCounterMap *SampleCounters,
+                             bool ProfileIsCS) {
+  std::unique_ptr<ProfileGeneratorBase> Generator;
+  if (ProfileIsCS) {
+    Generator.reset(new CSProfileGenerator(Binary, SampleCounters));
+  } else {
+    Generator.reset(new ProfileGenerator(Binary, SampleCounters));
+  }
+  ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
+  FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
+
+  return Generator;
+}
+
+std::unique_ptr<ProfileGeneratorBase>
+ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles,
+                             bool ProfileIsCS) {
+  std::unique_ptr<ProfileGeneratorBase> Generator;
+  if (ProfileIsCS) {
+    Generator.reset(new CSProfileGenerator(Binary, Profiles));
+  } else {
+    Generator.reset(new ProfileGenerator(Binary, std::move(Profiles)));
+  }
+  ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
+  FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
+
+  return Generator;
+}
+
+void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer,
+                                 SampleProfileMap &ProfileMap) {
+  // Populate profile symbol list if extended binary format is used.
+  ProfileSymbolList SymbolList;
+
+  if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) {
+    Binary->populateSymbolListFromDWARF(SymbolList);
+    Writer->setProfileSymbolList(&SymbolList);
+  }
+
+  if (std::error_code EC = Writer->write(ProfileMap))
+    exitWithError(std::move(EC));
+}
+
+void ProfileGeneratorBase::write() {
+  auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat);
+  if (std::error_code EC = WriterOrErr.getError())
+    exitWithError(EC, OutputFilename);
+
+  if (UseMD5) {
+    if (OutputFormat != SPF_Ext_Binary)
+      WithColor::warning() << "-use-md5 is ignored. 
Specify "
+                              "--format=extbinary to enable it\n";
+    else
+      WriterOrErr.get()->setUseMD5();
+  }
+
+  write(std::move(WriterOrErr.get()), ProfileMap);
+}
+
+void ProfileGeneratorBase::showDensitySuggestion(double Density) {
+  if (Density == 0.0)
+    WithColor::warning() << "The output profile is empty or the "
+                            "--profile-density-cutoff-hot option is "
+                            "set too low. Please check your command.\n";
+  else if (Density < ProfileDensityThreshold)
+    WithColor::warning()
+        << "Sample PGO is estimated to optimize better with "
+        << format("%.1f", ProfileDensityThreshold / Density)
+        << "x more samples. Please consider increasing sampling rate or "
+           "profiling for longer duration to get more samples.\n";
+
+  if (ShowDensity)
+    outs() << "Functions with density >= " << format("%.1f", Density)
+           << " account for "
+           << format("%.2f",
+                     static_cast<double>(ProfileDensityCutOffHot) / 10000)
+           << "% total sample counts.\n";
+}
+
+bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) {
+  for (const auto &Prefix : FuncPrefixsToFilter) {
+    if (FS.getFuncName().starts_with(Prefix))
+      return true;
+  }
+
+  // Filter the function profiles for the inlinees. It's useful for fuzzy
+  // profile matching which flattens the profile and inlinees' samples are
+  // merged into the top-level function.
+  for (auto &Callees :
+       const_cast<CallsiteSampleMap &>(FS.getCallsiteSamples())) {
+    auto &CalleesMap = Callees.second;
+    for (auto I = CalleesMap.begin(); I != CalleesMap.end();) {
+      auto FS = I++;
+      if (filterAmbiguousProfile(FS->second))
+        CalleesMap.erase(FS);
+    }
+  }
+  return false;
+}
+
+// For built-in local initialization functions such as __cxx_global_var_init
+// and __tls_init prefix functions, there could be multiple versions of the
+// functions in the final binary. However, in profile generation, we call
+// getCanonicalFnName to canonicalize the names, which strips the suffixes.
+// Therefore, samples from different functions query the same profile and the
+// samples are merged. As the functions are essentially different, entries of
+// the merged profile are ambiguous. In the sample loader, the IR from one
+// version would be attributed towards the merged entries, which is inaccurate.
+// Especially for fuzzy profile matching, it gets multiple callsites (from
+// different functions) that are used to match one callsite, which misleads the
+// matching and causes a lot of false-positive reports. Hence, we want to
+// filter them out from the profile map during profile generation time. The
+// profiles are all for cold functions, so this won't have a perf impact.
+void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) {
+  for (auto I = ProfileMap.begin(); I != ProfileMap.end();) {
+    auto FS = I++;
+    if (filterAmbiguousProfile(FS->second))
+      ProfileMap.erase(FS);
+  }
+}
+
+void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
+                                              const RangeSample &Ranges) {
+
+  /*
+  Regions may overlap with each other. Using the boundary info, find all
+  disjoint ranges and their sample count. BoundaryPoint records the total
+  count of the samples that begin/end at each point.
+
+  |<--100-->|           Sample1
+  |<------200------>|   Sample2
+  A         B         C
+
+  In the example above,
+  Sample1 begins at A, ends at B, its value is 100.
+  Sample2 begins at A, ends at C, its value is 200.
+  For A, BeginCount is the sum of the samples that begin at A, which is 300,
+  and no samples end at A, so EndCount is 0. 
+  Then boundary points A, B, and C with begin/end counts are:
+  A: (300, 0)
+  B: (0, 100)
+  C: (0, 200)
+  */
+  struct BoundaryPoint {
+    // Sum of sample counts beginning at this point
+    uint64_t BeginCount = UINT64_MAX;
+    // Sum of sample counts ending at this point
+    uint64_t EndCount = UINT64_MAX;
+    // Is the begin point of a zero range.
+    bool IsZeroRangeBegin = false;
+    // Is the end point of a zero range.
+    bool IsZeroRangeEnd = false;
+
+    void addBeginCount(uint64_t Count) {
+      if (BeginCount == UINT64_MAX)
+        BeginCount = 0;
+      BeginCount += Count;
+    }
+
+    void addEndCount(uint64_t Count) {
+      if (EndCount == UINT64_MAX)
+        EndCount = 0;
+      EndCount += Count;
+    }
+  };
+
+  /*
+  For the above example, with boundary points, the following logic finds two
+  disjoint regions:
+
+  [A,B]:   300
+  [B+1,C]: 200
+
+  If there is a boundary point that is both a begin and an end point, the
+  point itself becomes a separate disjoint region. For example, if we have
+  original ranges of
+
+  |<--- 100 --->|
+                |<--- 200 --->|
+  A             B             C
+
+  there are three boundary points with their begin/end counts of
+
+  A: (100, 0)
+  B: (200, 100)
+  C: (0, 200)
+
+  the disjoint ranges would be
+
+  [A, B-1]: 100
+  [B, B]:   300
+  [B+1, C]: 200.
+
+  Example for zero value range:
+
+    |<--- 100 --->|
+                       |<--- 200 --->|
+  |<---------------  0  ----------------->|
+  A  B            C    D             E    F
+
+  [A, B-1]  : 0
+  [B, C]    : 100
+  [C+1, D-1]: 0
+  [D, E]    : 200
+  [E+1, F]  : 0
+  */
+  std::map<uint64_t, BoundaryPoint> Boundaries;
+
+  for (const auto &Item : Ranges) {
+    assert(Item.first.first <= Item.first.second &&
+           "Invalid instruction range");
+    auto &BeginPoint = Boundaries[Item.first.first];
+    auto &EndPoint = Boundaries[Item.first.second];
+    uint64_t Count = Item.second;
+
+    BeginPoint.addBeginCount(Count);
+    EndPoint.addEndCount(Count);
+    if (Count == 0) {
+      BeginPoint.IsZeroRangeBegin = true;
+      EndPoint.IsZeroRangeEnd = true;
+    }
+  }
+
+  // Use UINT64_MAX to indicate there is no existing range between BeginAddress
+  // and the next valid address
+  uint64_t BeginAddress = UINT64_MAX;
+  int ZeroRangeDepth = 0;
+  uint64_t Count = 0;
+  for (const auto &Item : Boundaries) {
+    uint64_t Address = Item.first;
+    const BoundaryPoint &Point = Item.second;
+    if (Point.BeginCount != UINT64_MAX) {
+      if (BeginAddress != UINT64_MAX)
+        DisjointRanges[{BeginAddress, Address - 1}] = Count;
+      Count += Point.BeginCount;
+      BeginAddress = Address;
+      ZeroRangeDepth += Point.IsZeroRangeBegin;
+    }
+    if (Point.EndCount != UINT64_MAX) {
+      assert((BeginAddress != UINT64_MAX) &&
+             "First boundary point cannot be 'end' point");
+      DisjointRanges[{BeginAddress, Address}] = Count;
+      assert(Count >= Point.EndCount && "Mismatched live ranges");
+      Count -= Point.EndCount;
+      BeginAddress = Address + 1;
+      ZeroRangeDepth -= Point.IsZeroRangeEnd;
+      // If the remaining count is zero and it's no longer in a zero range,
+      // this means we have consumed all the ranges before, thus mark
+      // BeginAddress as UINT64_MAX. e.g. supposing we have two
+      // non-overlapping ranges:
+      //  [<---- 10 ---->]
+      //                       [<---- 20 ---->]
+      //   A             B     C              D
+      // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't
+      // have the [B+1, C-1] zero range. 
+      if (Count == 0 && ZeroRangeDepth == 0)
+        BeginAddress = UINT64_MAX;
+    }
+  }
+}
+
+void ProfileGeneratorBase::updateBodySamplesforFunctionProfile(
+    FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc,
+    uint64_t Count) {
+  // Use the maximum count of samples with the same line location
+  uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator);
+
+  // Use the duplication factor to compensate for loop unroll/vectorization.
+  // Note that this is only needed when we're taking MAX of the counts at
+  // the location instead of SUM.
+  Count *= getDuplicationFactor(LeafLoc.Location.Discriminator);
+
+  ErrorOr<uint64_t> R =
+      FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator);
+
+  uint64_t PreviousCount = R ? R.get() : 0;
+  if (PreviousCount <= Count) {
+    FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator,
+                                   Count - PreviousCount);
+  }
+}
+
+void ProfileGeneratorBase::updateTotalSamples() {
+  for (auto &Item : ProfileMap) {
+    FunctionSamples &FunctionProfile = Item.second;
+    FunctionProfile.updateTotalSamples();
+  }
+}
+
+void ProfileGeneratorBase::updateCallsiteSamples() {
+  for (auto &Item : ProfileMap) {
+    FunctionSamples &FunctionProfile = Item.second;
+    FunctionProfile.updateCallsiteSamples();
+  }
+}
+
+void ProfileGeneratorBase::updateFunctionSamples() {
+  updateCallsiteSamples();
+
+  if (UpdateTotalSamples)
+    updateTotalSamples();
+}
+
+void ProfileGeneratorBase::collectProfiledFunctions() {
+  std::unordered_set<const BinaryFunction *> ProfiledFunctions;
+  if (collectFunctionsFromRawProfile(ProfiledFunctions))
+    Binary->setProfiledFunctions(ProfiledFunctions);
+  else if (collectFunctionsFromLLVMProfile(ProfiledFunctions))
+    Binary->setProfiledFunctions(ProfiledFunctions);
+  else
+    llvm_unreachable("Unsupported input profile");
+}
+
+bool ProfileGeneratorBase::collectFunctionsFromRawProfile(
+    std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
+  if (!SampleCounters)
+    return false;
+  // Go through all the stacks, ranges and branches in the sample counters,
+  // use the start of the range to look up the function it belongs to and
+  // record the function. 
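+  // The collected set is later used to restrict work such as symbolization
+  // and pseudo-probe decoding to functions that actually show up in the
+  // profile.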
+  for (const auto &CI : *SampleCounters) {
+    if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(CI.first.getPtr())) {
+      for (auto StackAddr : CtxKey->Context) {
+        if (FuncRange *FRange = Binary->findFuncRange(StackAddr))
+          ProfiledFunctions.insert(FRange->Func);
+      }
+    }
+
+    for (auto Item : CI.second.RangeCounter) {
+      uint64_t StartAddress = Item.first.first;
+      if (FuncRange *FRange = Binary->findFuncRange(StartAddress))
+        ProfiledFunctions.insert(FRange->Func);
+    }
+
+    for (auto Item : CI.second.BranchCounter) {
+      uint64_t SourceAddress = Item.first.first;
+      uint64_t TargetAddress = Item.first.second;
+      if (FuncRange *FRange = Binary->findFuncRange(SourceAddress))
+        ProfiledFunctions.insert(FRange->Func);
+      if (FuncRange *FRange = Binary->findFuncRange(TargetAddress))
+        ProfiledFunctions.insert(FRange->Func);
+    }
+  }
+  return true;
+}
+
+bool ProfileGenerator::collectFunctionsFromLLVMProfile(
+    std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
+  for (const auto &FS : ProfileMap) {
+    if (auto *Func = Binary->getBinaryFunction(FS.second.getFunction()))
+      ProfiledFunctions.insert(Func);
+  }
+  return true;
+}
+
+bool CSProfileGenerator::collectFunctionsFromLLVMProfile(
+    std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
+  for (auto *Node : ContextTracker) {
+    if (!Node->getFuncName().empty())
+      if (auto *Func = Binary->getBinaryFunction(Node->getFuncName()))
+        ProfiledFunctions.insert(Func);
+  }
+  return true;
+}
+
+FunctionSamples &
+ProfileGenerator::getTopLevelFunctionProfile(FunctionId FuncName) {
+  SampleContext Context(FuncName);
+  return ProfileMap.create(Context);
+}
+
+void ProfileGenerator::generateProfile() {
+  collectProfiledFunctions();
+
+  if (Binary->usePseudoProbes())
+    Binary->decodePseudoProbe();
+
+  if (SampleCounters) {
+    if (Binary->usePseudoProbes()) {
+      generateProbeBasedProfile();
+    } else {
+      generateLineNumBasedProfile();
+    }
+  }
+
+  postProcessProfiles();
+}
+
+void ProfileGenerator::postProcessProfiles() {
+  computeSummaryAndThreshold(ProfileMap);
+  trimColdProfiles(ProfileMap, ColdCountThreshold);
+  filterAmbiguousProfile(ProfileMap);
+  calculateAndShowDensity(ProfileMap);
+}
+
+void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles,
+                                        uint64_t ColdCntThreshold) {
+  if (!TrimColdProfile)
+    return;
+
+  // Move cold profiles into a tmp container.
+  std::vector<uint64_t> ColdProfileHashes;
+  for (const auto &I : ProfileMap) {
+    if (I.second.getTotalSamples() < ColdCntThreshold)
+      ColdProfileHashes.emplace_back(I.first);
+  }
+
+  // Remove the cold profiles from ProfileMap. 
+ for (const auto &I : ColdProfileHashes) + ProfileMap.erase(I); +} + +void ProfileGenerator::generateLineNumBasedProfile() { + assert(SampleCounters->size() == 1 && + "Must have one entry for profile generation."); + const SampleCounter &SC = SampleCounters->begin()->second; + // Fill in function body samples + populateBodySamplesForAllFunctions(SC.RangeCounter); + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesForAllFunctions(SC.BranchCounter); + + updateFunctionSamples(); +} + +void ProfileGenerator::generateProbeBasedProfile() { + assert(SampleCounters->size() == 1 && + "Must have one entry for profile generation."); + // Enable pseudo probe functionalities in SampleProf + FunctionSamples::ProfileIsProbeBased = true; + const SampleCounter &SC = SampleCounters->begin()->second; + // Fill in function body samples + populateBodySamplesWithProbesForAllFunctions(SC.RangeCounter); + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesWithProbesForAllFunctions(SC.BranchCounter); + + updateFunctionSamples(); +} + +void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions( + const RangeSample &RangeCounter) { + ProbeCounterMap ProbeCounter; + // preprocessRangeCounter returns disjoint ranges, so no longer to redo it + // inside extractProbesFromRange. + extractProbesFromRange(preprocessRangeCounter(RangeCounter), ProbeCounter, + false); + + for (const auto &PI : ProbeCounter) { + const MCDecodedPseudoProbe *Probe = PI.first; + uint64_t Count = PI.second; + SampleContextFrameVector FrameVec; + Binary->getInlineContextForProbe(Probe, FrameVec, true); + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, Count); + FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(), + Count); + if (Probe->isEntry()) + FunctionProfile.addHeadSamples(Count); + } +} + +void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions( + const BranchSample &BranchCounters) { + for (const auto &Entry : BranchCounters) { + uint64_t SourceAddress = Entry.first.first; + uint64_t TargetAddress = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(SourceAddress); + if (CallProbe == nullptr) + continue; + + // Record called target sample and its count. 
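+    // The call is attributed to the leaf frame of the call probe's inline
+    // context, i.e. the innermost function that contains the call site.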
+ SampleContextFrameVector FrameVec; + Binary->getInlineContextForProbe(CallProbe, FrameVec, true); + + if (!FrameVec.empty()) { + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, 0); + FunctionProfile.addCalledTargetSamples( + FrameVec.back().Location.LineOffset, + FrameVec.back().Location.Discriminator, + FunctionId(CalleeName), Count); + } + } +} + +FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( + const SampleContextFrameVector &FrameVec, uint64_t Count) { + // Get top level profile + FunctionSamples *FunctionProfile = + &getTopLevelFunctionProfile(FrameVec[0].Func); + FunctionProfile->addTotalSamples(Count); + if (Binary->usePseudoProbes()) { + const auto *FuncDesc = Binary->getFuncDescForGUID( + FunctionProfile->getFunction().getHashCode()); + FunctionProfile->setFunctionHash(FuncDesc->FuncHash); + } + + for (size_t I = 1; I < FrameVec.size(); I++) { + LineLocation Callsite( + FrameVec[I - 1].Location.LineOffset, + getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator)); + FunctionSamplesMap &SamplesMap = + FunctionProfile->functionSamplesAt(Callsite); + auto Ret = + SamplesMap.emplace(FrameVec[I].Func, FunctionSamples()); + if (Ret.second) { + SampleContext Context(FrameVec[I].Func); + Ret.first->second.setContext(Context); + } + FunctionProfile = &Ret.first->second; + FunctionProfile->addTotalSamples(Count); + if (Binary->usePseudoProbes()) { + const auto *FuncDesc = Binary->getFuncDescForGUID( + FunctionProfile->getFunction().getHashCode()); + FunctionProfile->setFunctionHash(FuncDesc->FuncHash); + } + } + + return *FunctionProfile; +} + +RangeSample +ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) { + RangeSample Ranges(RangeCounter.begin(), RangeCounter.end()); + if (FillZeroForAllFuncs) { + for (auto &FuncI : Binary->getAllBinaryFunctions()) { + for (auto &R : FuncI.second.Ranges) { + Ranges[{R.first, R.second - 1}] += 0; + } + } + } else { + // For each range, we search for all ranges of the function it belongs to + // and initialize it with zero count, so it remains zero if doesn't hit any + // samples. This is to be consistent with compiler that interpret zero count + // as unexecuted(cold). + for (const auto &I : RangeCounter) { + uint64_t StartAddress = I.first.first; + for (const auto &Range : Binary->getRanges(StartAddress)) + Ranges[{Range.first, Range.second - 1}] += 0; + } + } + RangeSample DisjointRanges; + findDisjointRanges(DisjointRanges, Ranges); + return DisjointRanges; +} + +void ProfileGenerator::populateBodySamplesForAllFunctions( + const RangeSample &RangeCounter) { + for (const auto &Range : preprocessRangeCounter(RangeCounter)) { + uint64_t RangeBegin = Range.first.first; + uint64_t RangeEnd = Range.first.second; + uint64_t Count = Range.second; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + const SampleContextFrameVector FrameVec = + Binary->getFrameLocationStack(IP.Address); + if (!FrameVec.empty()) { + // FIXME: As accumulating total count per instruction caused some + // regression, we changed to accumulate total count per byte as a + // workaround. Tuning hotness threshold on the compiler side might be + // necessary in the future. 
+        FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples(
+            FrameVec, Count * Binary->getInstSize(IP.Address));
+        updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(),
+                                            Count);
+      }
+    } while (IP.advance() && IP.Address <= RangeEnd);
+  }
+}
+
+StringRef
+ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) {
+  // Get the function range by branch target if it's a call branch.
+  auto *FRange = Binary->findFuncRangeForStartAddr(TargetAddress);
+
+  // We won't accumulate sample count for a range whose start is not the real
+  // function entry, such as an outlined function or inner labels.
+  if (!FRange || !FRange->IsFuncEntry)
+    return StringRef();
+
+  return FunctionSamples::getCanonicalFnName(FRange->getFuncName());
+}
+
+void ProfileGenerator::populateBoundarySamplesForAllFunctions(
+    const BranchSample &BranchCounters) {
+  for (const auto &Entry : BranchCounters) {
+    uint64_t SourceAddress = Entry.first.first;
+    uint64_t TargetAddress = Entry.first.second;
+    uint64_t Count = Entry.second;
+    assert(Count != 0 && "Unexpected zero weight branch");
+
+    StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
+    if (CalleeName.size() == 0)
+      continue;
+    // Record called target sample and its count.
+    const SampleContextFrameVector &FrameVec =
+        Binary->getCachedFrameLocationStack(SourceAddress);
+    if (!FrameVec.empty()) {
+      FunctionSamples &FunctionProfile =
+          getLeafProfileAndAddTotalSamples(FrameVec, 0);
+      FunctionProfile.addCalledTargetSamples(
+          FrameVec.back().Location.LineOffset,
+          getBaseDiscriminator(FrameVec.back().Location.Discriminator),
+          FunctionId(CalleeName), Count);
+    }
+    // Add head samples for the callee.
+    FunctionSamples &CalleeProfile =
+        getTopLevelFunctionProfile(FunctionId(CalleeName));
+    CalleeProfile.addHeadSamples(Count);
+  }
+}
+
+void ProfileGeneratorBase::calculateBodySamplesAndSize(
+    const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
+    uint64_t &FuncBodySize) {
+  // Note that ideally the size should be the number of function instructions.
+  // However, for probe-based profiles, we don't have the accurate instruction
+  // count for each probe; instead, the probe sample is the sample count for
+  // the block, which is equivalent to
+  // total_instruction_samples/num_of_instructions in one block. Hence, we use
+  // the number of probes as a proxy for the function's size.
+  FuncBodySize += FSamples.getBodySamples().size();
+
+  // The accumulated body samples re-calculated here could be different from
+  // the TotalSamples(getTotalSamples) field of FunctionSamples for
+  // line-number based profiles. The reason is that TotalSamples is the sum of
+  // all the samples of the machine instructions in one source-code line,
+  // whereas each entry of BodySamples only keeps the max of them, so
+  // TotalSamples is usually much bigger than the accumulated body samples, as
+  // one source-code line can emit many machine instructions. We observed a
+  // regression when we switched to using the accumulated body samples (by
+  // using -update-total-samples). Hence, it's safer to re-calculate here to
+  // avoid such a discrepancy. There is no problem for probe-based profiles,
+  // as the TotalSamples is exactly the same as the accumulated body samples. 
+  for (const auto &I : FSamples.getBodySamples())
+    TotalBodySamples += I.second.getSamples();
+
+  for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
+    for (const auto &Callee : CallsiteSamples.second) {
+      // For binary-level density, the inlinees' samples and size should be
+      // included in the calculation.
+      calculateBodySamplesAndSize(Callee.second, TotalBodySamples,
+                                  FuncBodySize);
+    }
+}
+
+// Calculate profile density:
+// Calculate the density for each function and sort them in descending order,
+// keep accumulating their total samples until it exceeds the
+// percentage_threshold(cut-off) of total profile samples. The profile density
+// is the last(minimum) function density of the processed functions, which
+// means that if the profile density is good, all the functions hot to perf
+// have good density. The percentage_threshold(--profile-density-cutoff-hot)
+// is configurable depending on how much regression the system wants to
+// tolerate.
+double
+ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
+  double ProfileDensity = 0.0;
+
+  uint64_t TotalProfileSamples = 0;
+  // A list of the function profile density and its total samples.
+  std::vector<std::pair<double, uint64_t>> FuncDensityList;
+  for (const auto &I : Profiles) {
+    uint64_t TotalBodySamples = 0;
+    uint64_t FuncBodySize = 0;
+    calculateBodySamplesAndSize(I.second, TotalBodySamples, FuncBodySize);
+
+    if (FuncBodySize == 0)
+      continue;
+
+    double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
+    TotalProfileSamples += TotalBodySamples;
+    FuncDensityList.emplace_back(FuncDensity, TotalBodySamples);
+  }
+
+  // Sort by density in descending order.
+  llvm::stable_sort(FuncDensityList, [&](const std::pair<double, uint64_t> &A,
+                                         const std::pair<double, uint64_t> &B) {
+    if (A.first != B.first)
+      return A.first > B.first;
+    return A.second < B.second;
+  });
+
+  uint64_t AccumulatedSamples = 0;
+  uint32_t I = 0;
+  assert(ProfileDensityCutOffHot <= 1000000 &&
+         "The cutoff value is greater than 1000000(100%)");
+  while (AccumulatedSamples < TotalProfileSamples *
+                                  static_cast<double>(ProfileDensityCutOffHot) /
+                                  1000000 &&
+         I < FuncDensityList.size()) {
+    AccumulatedSamples += FuncDensityList[I].second;
+    ProfileDensity = FuncDensityList[I].first;
+    I++;
+  }
+
+  return ProfileDensity;
+}
+
+void ProfileGeneratorBase::calculateAndShowDensity(
+    const SampleProfileMap &Profiles) {
+  double Density = calculateDensity(Profiles);
+  showDensitySuggestion(Density);
+}
+
+FunctionSamples *
+CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode,
+                                               bool WasLeafInlined) {
+  FunctionSamples *FProfile = ContextNode->getFunctionSamples();
+  if (!FProfile) {
+    FSamplesList.emplace_back();
+    FProfile = &FSamplesList.back();
+    FProfile->setFunction(ContextNode->getFuncName());
+    ContextNode->setFunctionSamples(FProfile);
+  }
+  // Update the ContextWasInlined attribute for existing contexts.
+  // The current function can be called in two ways:
+  //   - when processing a probe of the current frame
+  //   - when processing the entry probe of an inlinee's frame, which
+  //     is then used to update the callsite count of the current frame.
+  // The two can happen in any order, hence here we are making sure
+  // `ContextWasInlined` is always set as expected.
+  // TODO: Note that the former does not always happen if no probes of the
+  // current frame have samples, and if the latter happens, we could lose the
+  // attribute. This should be fixed. 
+ if (WasLeafInlined) + FProfile->getContext().setAttribute(ContextWasInlined); + return FProfile; +} + +ContextTrieNode * +CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context, + bool WasLeafInlined) { + ContextTrieNode *ContextNode = + ContextTracker.getOrCreateContextPath(Context, true); + getOrCreateFunctionSamples(ContextNode, WasLeafInlined); + return ContextNode; +} + +void CSProfileGenerator::generateProfile() { + FunctionSamples::ProfileIsCS = true; + + collectProfiledFunctions(); + + if (Binary->usePseudoProbes()) { + Binary->decodePseudoProbe(); + if (InferMissingFrames) + initializeMissingFrameInferrer(); + } + + if (SampleCounters) { + if (Binary->usePseudoProbes()) { + generateProbeBasedProfile(); + } else { + generateLineNumBasedProfile(); + } + } + + if (Binary->getTrackFuncContextSize()) + computeSizeForProfiledFunctions(); + + postProcessProfiles(); +} + +void CSProfileGenerator::initializeMissingFrameInferrer() { + Binary->getMissingContextInferrer()->initialize(SampleCounters); +} + +void CSProfileGenerator::inferMissingFrames( + const SmallVectorImpl &Context, + SmallVectorImpl &NewContext) { + Binary->inferMissingFrames(Context, NewContext); +} + +void CSProfileGenerator::computeSizeForProfiledFunctions() { + for (auto *Func : Binary->getProfiledFunctions()) + Binary->computeInlinedContextSizeForFunc(Func); + + // Flush the symbolizer to save memory. + Binary->flushSymbolizer(); +} + +void CSProfileGenerator::updateFunctionSamples() { + for (auto *Node : ContextTracker) { + FunctionSamples *FSamples = Node->getFunctionSamples(); + if (FSamples) { + if (UpdateTotalSamples) + FSamples->updateTotalSamples(); + FSamples->updateCallsiteSamples(); + } + } +} + +void CSProfileGenerator::generateLineNumBasedProfile() { + for (const auto &CI : *SampleCounters) { + const auto *CtxKey = cast(CI.first.getPtr()); + + ContextTrieNode *ContextNode = &getRootContext(); + // Sample context will be empty if the jump is an external-to-internal call + // pattern, the head samples should be added for the internal function. + if (!CtxKey->Context.empty()) { + // Get or create function profile for the range + ContextNode = + getOrCreateContextNode(CtxKey->Context, CtxKey->WasLeafInlined); + // Fill in function body samples + populateBodySamplesForFunction(*ContextNode->getFunctionSamples(), + CI.second.RangeCounter); + } + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesForFunction(ContextNode, CI.second.BranchCounter); + } + // Fill in call site value sample for inlined calls and also use context to + // infer missing samples. Since we don't have call count for inlined + // functions, we estimate it from inlinee's profile using the entry of the + // body sample. + populateInferredFunctionSamples(getRootContext()); + + updateFunctionSamples(); +} + +void CSProfileGenerator::populateBodySamplesForFunction( + FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) { + // Compute disjoint ranges first, so we can use MAX + // for calculating count for each location. + RangeSample Ranges; + findDisjointRanges(Ranges, RangeCounter); + for (const auto &Range : Ranges) { + uint64_t RangeBegin = Range.first.first; + uint64_t RangeEnd = Range.first.second; + uint64_t Count = Range.second; + // Disjoint ranges have introduce zero-filled gap that + // doesn't belong to current context, filter them out. 
+ if (Count == 0) + continue; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + auto LeafLoc = Binary->getInlineLeafFrameLoc(IP.Address); + if (LeafLoc) { + // Recording body sample for this specific context + updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count); + FunctionProfile.addTotalSamples(Count); + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +void CSProfileGenerator::populateBoundarySamplesForFunction( + ContextTrieNode *Node, const BranchSample &BranchCounters) { + + for (const auto &Entry : BranchCounters) { + uint64_t SourceAddress = Entry.first.first; + uint64_t TargetAddress = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + + ContextTrieNode *CallerNode = Node; + LineLocation CalleeCallSite(0, 0); + if (CallerNode != &getRootContext()) { + // Record called target sample and its count + auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceAddress); + if (LeafLoc) { + CallerNode->getFunctionSamples()->addCalledTargetSamples( + LeafLoc->Location.LineOffset, + getBaseDiscriminator(LeafLoc->Location.Discriminator), + FunctionId(CalleeName), + Count); + // Record head sample for called target(callee) + CalleeCallSite = LeafLoc->Location; + } + } + + ContextTrieNode *CalleeNode = + CallerNode->getOrCreateChildContext(CalleeCallSite, + FunctionId(CalleeName)); + FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(CalleeNode); + CalleeProfile->addHeadSamples(Count); + } +} + +void CSProfileGenerator::populateInferredFunctionSamples( + ContextTrieNode &Node) { + // There is no call jmp sample between the inliner and inlinee, we need to use + // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s + // sample depends on child(inlinee)'s sample, so traverse the tree in + // post-order. + for (auto &It : Node.getAllChildContext()) + populateInferredFunctionSamples(It.second); + + FunctionSamples *CalleeProfile = Node.getFunctionSamples(); + if (!CalleeProfile) + return; + // If we already have head sample counts, we must have value profile + // for call sites added already. Skip to avoid double counting. + if (CalleeProfile->getHeadSamples()) + return; + ContextTrieNode *CallerNode = Node.getParentContext(); + // If we don't have context, nothing to do for caller's call site. + // This could happen for entry point function. + if (CallerNode == &getRootContext()) + return; + + LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc(); + FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(CallerNode); + // Since we don't have call count for inlined functions, we + // estimate it from inlinee's profile using entry body sample. + uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate(); + // If we don't have samples with location, use 1 to indicate live. 
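+  // (A profile node exists for the callee, so the call site must have been
+  // reached even though no entry samples were recorded for it.)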
+ if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size()) + EstimatedCallCount = 1; + CallerProfile.addCalledTargetSamples(CallerLeafFrameLoc.LineOffset, + CallerLeafFrameLoc.Discriminator, + Node.getFuncName(), EstimatedCallCount); + CallerProfile.addBodySamples(CallerLeafFrameLoc.LineOffset, + CallerLeafFrameLoc.Discriminator, + EstimatedCallCount); + CallerProfile.addTotalSamples(EstimatedCallCount); +} + +void CSProfileGenerator::convertToProfileMap( + ContextTrieNode &Node, SampleContextFrameVector &Context) { + FunctionSamples *FProfile = Node.getFunctionSamples(); + if (FProfile) { + Context.emplace_back(Node.getFuncName(), LineLocation(0, 0)); + // Save the new context for future references. + SampleContextFrames NewContext = *Contexts.insert(Context).first; + auto Ret = ProfileMap.emplace(NewContext, std::move(*FProfile)); + FunctionSamples &NewProfile = Ret.first->second; + NewProfile.getContext().setContext(NewContext); + Context.pop_back(); + } + + for (auto &It : Node.getAllChildContext()) { + ContextTrieNode &ChildNode = It.second; + Context.emplace_back(Node.getFuncName(), ChildNode.getCallSiteLoc()); + convertToProfileMap(ChildNode, Context); + Context.pop_back(); + } +} + +void CSProfileGenerator::convertToProfileMap() { + assert(ProfileMap.empty() && + "ProfileMap should be empty before converting from the trie"); + assert(IsProfileValidOnTrie && + "Do not convert the trie twice, it's already destroyed"); + + SampleContextFrameVector Context; + for (auto &It : getRootContext().getAllChildContext()) + convertToProfileMap(It.second, Context); + + IsProfileValidOnTrie = false; +} + +void CSProfileGenerator::postProcessProfiles() { + // Compute hot/cold threshold based on profile. This will be used for cold + // context profile merging/trimming. + computeSummaryAndThreshold(); + + // Run global pre-inliner to adjust/merge context profile based on estimated + // inline decisions. + if (EnableCSPreInliner) { + ContextTracker.populateFuncToCtxtMap(); + CSPreInliner(ContextTracker, *Binary, Summary.get()).run(); + // Turn off the profile merger by default unless it is explicitly enabled. + if (!CSProfMergeColdContext.getNumOccurrences()) + CSProfMergeColdContext = false; + } + + convertToProfileMap(); + + // Trim and merge cold context profile using cold threshold above. + if (TrimColdProfile || CSProfMergeColdContext) { + SampleContextTrimmer(ProfileMap) + .trimAndMergeColdContextProfiles( + HotCountThreshold, TrimColdProfile, CSProfMergeColdContext, + CSProfMaxColdContextDepth, EnableCSPreInliner); + } + + if (GenCSNestedProfile) { + ProfileConverter CSConverter(ProfileMap); + CSConverter.convertCSProfiles(); + FunctionSamples::ProfileIsCS = false; + } + filterAmbiguousProfile(ProfileMap); + ProfileGeneratorBase::calculateAndShowDensity(ProfileMap); +} + +void ProfileGeneratorBase::computeSummaryAndThreshold( + SampleProfileMap &Profiles) { + SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); + Summary = Builder.computeSummaryForProfiles(Profiles); + HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold( + (Summary->getDetailedSummary())); + ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( + (Summary->getDetailedSummary())); +} + +void CSProfileGenerator::computeSummaryAndThreshold() { + // Always merge and use context-less profile map to compute summary. 
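+  // Without merging, a hot function whose samples are split across many
+  // contexts could look cold in every single context, which would skew the
+  // hot/cold thresholds derived from the summary.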
+  SampleProfileMap ContextLessProfiles;
+  ContextTracker.createContextLessProfileMap(ContextLessProfiles);
+
+  // Set the flag below to avoid merging the profile again in
+  // computeSummaryAndThreshold
+  FunctionSamples::ProfileIsCS = false;
+  assert(
+      (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) &&
+      "Don't set --profile-summary-contextless to false for profile "
+      "generation");
+  ProfileGeneratorBase::computeSummaryAndThreshold(ContextLessProfiles);
+  // Recover the old value.
+  FunctionSamples::ProfileIsCS = true;
+}
+
+void ProfileGeneratorBase::extractProbesFromRange(
+    const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter,
+    bool FindDisjointRanges) {
+  const RangeSample *PRanges = &RangeCounter;
+  RangeSample Ranges;
+  if (FindDisjointRanges) {
+    findDisjointRanges(Ranges, RangeCounter);
+    PRanges = &Ranges;
+  }
+
+  for (const auto &Range : *PRanges) {
+    uint64_t RangeBegin = Range.first.first;
+    uint64_t RangeEnd = Range.first.second;
+    uint64_t Count = Range.second;
+
+    InstructionPointer IP(Binary, RangeBegin, true);
+    // Disjoint ranges may have a range in the middle of two instructions,
+    // e.g. if Instr1 is at Addr1 and Instr2 at Addr2, a disjoint range
+    // can be Addr1+1 to Addr2-1. We should ignore such a range.
+    if (IP.Address > RangeEnd)
+      continue;
+
+    do {
+      const AddressProbesMap &Address2ProbesMap =
+          Binary->getAddress2ProbesMap();
+      for (const MCDecodedPseudoProbe &Probe :
+           Address2ProbesMap.find(IP.Address)) {
+        ProbeCounter[&Probe] += Count;
+      }
+    } while (IP.advance() && IP.Address <= RangeEnd);
+  }
+}
+
+static void extractPrefixContextStack(SampleContextFrameVector &ContextStack,
+                                      const SmallVectorImpl<uint64_t> &AddrVec,
+                                      ProfiledBinary *Binary) {
+  SmallVector<const MCDecodedPseudoProbe *, 16> Probes;
+  for (auto Address : reverse(AddrVec)) {
+    const MCDecodedPseudoProbe *CallProbe =
+        Binary->getCallProbeForAddr(Address);
+    // These could be the cases where a probe is not found at a callsite,
+    // cutting off the context from here since the inliner will not know how
+    // to consume a context with unknown callsites.
+    // 1. for functions that are not sampled when
+    // --decode-probe-for-profiled-functions-only is on.
+    // 2. for a merged callsite. Callsite merging may cause the loss of
+    // original probe IDs.
+    // 3. for an external callsite.
+    if (!CallProbe)
+      break;
+    Probes.push_back(CallProbe);
+  }
+
+  std::reverse(Probes.begin(), Probes.end());
+
+  // Extract the context stack for reuse; the leaf context stack will be
+  // added, compressed, while looking up the function profile. 
+  for (const auto *P : Probes) {
+    Binary->getInlineContextForProbe(P, ContextStack, true);
+  }
+}
+
+void CSProfileGenerator::generateProbeBasedProfile() {
+  // Enable pseudo probe functionalities in SampleProf
+  FunctionSamples::ProfileIsProbeBased = true;
+  for (const auto &CI : *SampleCounters) {
+    const AddrBasedCtxKey *CtxKey =
+        dyn_cast<AddrBasedCtxKey>(CI.first.getPtr());
+    // Fill in function body samples from probes, also infer caller's samples
+    // from callee's probe
+    populateBodySamplesWithProbes(CI.second.RangeCounter, CtxKey);
+    // Fill in boundary samples for a call probe
+    populateBoundarySamplesWithProbes(CI.second.BranchCounter, CtxKey);
+  }
+}
+
+void CSProfileGenerator::populateBodySamplesWithProbes(
+    const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) {
+  ProbeCounterMap ProbeCounter;
+  // Extract the top frame probes by looking up each address among the range in
+  // the Address2ProbeMap
+  extractProbesFromRange(RangeCounter, ProbeCounter);
+  std::unordered_map<MCDecodedPseudoProbeInlineTree *,
+                     std::unordered_set<FunctionSamples *>>
+      FrameSamples;
+  for (const auto &PI : ProbeCounter) {
+    const MCDecodedPseudoProbe *Probe = PI.first;
+    uint64_t Count = PI.second;
+    // Disjoint ranges may introduce zero-filled gaps that don't belong to
+    // the current context; filter them out.
+    if (!Probe->isBlock() || Count == 0)
+      continue;
+
+    ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, Probe);
+    FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples();
+    // Record the current frame and FunctionProfile whenever samples are
+    // collected for non-dangling probes. This is for reporting all of the
+    // zero count probes of the frame later.
+    FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile);
+    FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(),
+                                   Count);
+    FunctionProfile.addTotalSamples(Count);
+    if (Probe->isEntry()) {
+      FunctionProfile.addHeadSamples(Count);
+      // Look up for the caller's function profile
+      const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe);
+      ContextTrieNode *CallerNode = ContextNode->getParentContext();
+      if (InlinerDesc != nullptr && CallerNode != &getRootContext()) {
+        // Since the context id will be compressed, we have to use callee's
+        // context id to infer caller's context id to ensure they share the
+        // same context prefix.
+        uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset;
+        uint64_t CallerDiscriminator =
+            ContextNode->getCallSiteLoc().Discriminator;
+        assert(CallerIndex &&
+               "Inferred caller's location index shouldn't be zero!");
+        assert(!CallerDiscriminator &&
+               "Callsite probe should not have a discriminator!");
+        FunctionSamples &CallerProfile =
+            *getOrCreateFunctionSamples(CallerNode);
+        CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
+        CallerProfile.addBodySamples(CallerIndex, CallerDiscriminator, Count);
+        CallerProfile.addTotalSamples(Count);
+        CallerProfile.addCalledTargetSamples(CallerIndex, CallerDiscriminator,
+                                             ContextNode->getFuncName(), Count);
+      }
+    }
+  }
+
+  // Assign zero count for remaining probes without sample hits to
+  // differentiate from probes optimized away, of which the counts are unknown
+  // and will be inferred by the compiler. 
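+  // (Adding a zero count is a no-op for probes that already have samples,
+  // so it is safe to apply to every probe of the tracked frames.)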
+ for (auto &I : FrameSamples) { + for (auto *FunctionProfile : I.second) { + for (const MCDecodedPseudoProbe &Probe : I.first->getProbes()) { + FunctionProfile->addBodySamples(Probe.getIndex(), + Probe.getDiscriminator(), 0); + } + } + } +} + +void CSProfileGenerator::populateBoundarySamplesWithProbes( + const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) { + for (const auto &BI : BranchCounter) { + uint64_t SourceAddress = BI.first.first; + uint64_t TargetAddress = BI.first.second; + uint64_t Count = BI.second; + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(SourceAddress); + if (CallProbe == nullptr) + continue; + FunctionSamples &FunctionProfile = + getFunctionProfileForLeafProbe(CtxKey, CallProbe); + FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count); + FunctionProfile.addTotalSamples(Count); + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(), + CallProbe->getDiscriminator(), + FunctionId(CalleeName), Count); + } +} + +ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe( + const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { + + const SmallVectorImpl *PContext = &CtxKey->Context; + SmallVector NewContext; + + if (InferMissingFrames) { + SmallVector Context = CtxKey->Context; + // Append leaf frame for a complete inference. + Context.push_back(LeafProbe->getAddress()); + inferMissingFrames(Context, NewContext); + // Pop out the leaf probe that was pushed in above. + NewContext.pop_back(); + PContext = &NewContext; + } + + SampleContextFrameVector ContextStack; + extractPrefixContextStack(ContextStack, *PContext, Binary); + + // Explicitly copy the context for appending the leaf context + SampleContextFrameVector NewContextStack(ContextStack.begin(), + ContextStack.end()); + Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true); + // For leaf inlined context with the top frame, we should strip off the top + // frame's probe id, like: + // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar" + auto LeafFrame = NewContextStack.back(); + LeafFrame.Location = LineLocation(0, 0); + NewContextStack.pop_back(); + // Compress the context string except for the leaf frame + CSProfileGenerator::compressRecursionContext(NewContextStack); + CSProfileGenerator::trimContext(NewContextStack); + NewContextStack.push_back(LeafFrame); + + const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid()); + bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite(); + ContextTrieNode *ContextNode = + getOrCreateContextNode(NewContextStack, WasLeafInlined); + ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash); + return ContextNode; +} + +FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe( + const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { + return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples(); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-21.1/ProfileGenerator.h b/tools/ldc-profgen/ldc-profgen-21.1/ProfileGenerator.h new file mode 100644 index 0000000000..5e36128530 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-21.1/ProfileGenerator.h @@ -0,0 +1,401 @@ +//===-- ProfileGenerator.h - Profile Generator -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H
+#define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H
+#include "CSPreInliner.h"
+#include "ErrorHandling.h"
+#include "PerfReader.h"
+#include "ProfiledBinary.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/ProfileData/SampleProfWriter.h"
+#include <memory>
+#include <unordered_set>
+
+using namespace llvm;
+using namespace sampleprof;
+
+namespace llvm {
+namespace sampleprof {
+
+using ProbeCounterMap =
+    std::unordered_map<const MCDecodedPseudoProbe *, uint64_t>;
+
+// This is the base class for profile generation of sample-based PGO. We reuse
+// all structures relating to function profiles and profile writers as seen in
+// llvm/ProfileData/SampleProf.h.
+class ProfileGeneratorBase {
+
+public:
+  ProfileGeneratorBase(ProfiledBinary *Binary) : Binary(Binary){};
+  ProfileGeneratorBase(ProfiledBinary *Binary,
+                       const ContextSampleCounterMap *Counters)
+      : Binary(Binary), SampleCounters(Counters){};
+  ProfileGeneratorBase(ProfiledBinary *Binary,
+                       const SampleProfileMap &&Profiles)
+      : Binary(Binary), ProfileMap(std::move(Profiles)){};
+
+  virtual ~ProfileGeneratorBase() = default;
+  static std::unique_ptr<ProfileGeneratorBase>
+  create(ProfiledBinary *Binary, const ContextSampleCounterMap *Counters,
+         bool profileIsCS);
+  static std::unique_ptr<ProfileGeneratorBase>
+  create(ProfiledBinary *Binary, SampleProfileMap &ProfileMap,
+         bool profileIsCS);
+  virtual void generateProfile() = 0;
+  void write();
+
+  static uint32_t
+  getDuplicationFactor(unsigned Discriminator,
+                       bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) {
+    return UseFSD ? 1
+                  : llvm::DILocation::getDuplicationFactorFromDiscriminator(
+                        Discriminator);
+  }
+
+  static uint32_t
+  getBaseDiscriminator(unsigned Discriminator,
+                       bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) {
+    return UseFSD ? Discriminator
+                  : DILocation::getBaseDiscriminatorFromDiscriminator(
+                        Discriminator, /* IsFSDiscriminator */ false);
+  }
+
+  static bool UseFSDiscriminator;
+
+protected:
+  // Use SampleProfileWriter to serialize the profile map
+  void write(std::unique_ptr<SampleProfileWriter> Writer,
+             SampleProfileMap &ProfileMap);
+  /*
+  For each region boundary point, mark if it is begin or end (or both) of
+  the region. Boundary points are inclusive. Log the sample count as well
+  so we can use it when we compute the sample count of each disjoint region
+  later. Note that there might be multiple ranges with different sample
+  counts that share the same begin/end point. We need to accumulate the
+  sample count for the boundary point for such case, because for the
+  example below,
+
+  |<--100-->|
+  |<------200------>|
+  A         B       C
+
+  the sample count for the disjoint region [A,B] would be 300. 
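+  Likewise, the sample count for the disjoint region [B+1,C] would be 200.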
+  */
+  void findDisjointRanges(RangeSample &DisjointRanges,
+                          const RangeSample &Ranges);
+
+  // Go through each address from range to extract the top frame probe by
+  // looking up in the Address2ProbeMap
+  void extractProbesFromRange(const RangeSample &RangeCounter,
+                              ProbeCounterMap &ProbeCounter,
+                              bool FindDisjointRanges = true);
+
+  // Helper function for updating body sample for a leaf location in
+  // FunctionProfile
+  void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile,
+                                           const SampleContextFrame &LeafLoc,
+                                           uint64_t Count);
+
+  void updateFunctionSamples();
+
+  void updateTotalSamples();
+
+  void updateCallsiteSamples();
+
+  void filterAmbiguousProfile(SampleProfileMap &Profiles);
+
+  bool filterAmbiguousProfile(FunctionSamples &FS);
+
+  StringRef getCalleeNameForAddress(uint64_t TargetAddress);
+
+  void computeSummaryAndThreshold(SampleProfileMap &ProfileMap);
+
+  void calculateBodySamplesAndSize(const FunctionSamples &FSamples,
+                                   uint64_t &TotalBodySamples,
+                                   uint64_t &FuncBodySize);
+
+  double calculateDensity(const SampleProfileMap &Profiles);
+
+  void calculateAndShowDensity(const SampleProfileMap &Profiles);
+
+  void showDensitySuggestion(double Density);
+
+  void collectProfiledFunctions();
+
+  bool collectFunctionsFromRawProfile(
+      std::unordered_set<const BinaryFunction *> &ProfiledFunctions);
+
+  // Collect profiled functions for llvm sample profile input.
+  virtual bool collectFunctionsFromLLVMProfile(
+      std::unordered_set<const BinaryFunction *> &ProfiledFunctions) = 0;
+
+  // List of function prefixes to filter out.
+  static constexpr const char *FuncPrefixsToFilter[] = {"__cxx_global_var_init",
+                                                        "__tls_init"};
+
+  // Thresholds from profile summary to answer isHotCount/isColdCount queries.
+  uint64_t HotCountThreshold;
+
+  uint64_t ColdCountThreshold;
+
+  ProfiledBinary *Binary = nullptr;
+
+  std::unique_ptr<ProfileSummary> Summary;
+
+  // Used by SampleProfileWriter
+  SampleProfileMap ProfileMap;
+
+  const ContextSampleCounterMap *SampleCounters = nullptr;
+};
+
+class ProfileGenerator : public ProfileGeneratorBase {
+
+public:
+  ProfileGenerator(ProfiledBinary *Binary,
+                   const ContextSampleCounterMap *Counters)
+      : ProfileGeneratorBase(Binary, Counters){};
+  ProfileGenerator(ProfiledBinary *Binary, const SampleProfileMap &&Profiles)
+      : ProfileGeneratorBase(Binary, std::move(Profiles)){};
+  void generateProfile() override;
+
+private:
+  void generateLineNumBasedProfile();
+  void generateProbeBasedProfile();
+  RangeSample preprocessRangeCounter(const RangeSample &RangeCounter);
+  FunctionSamples &getTopLevelFunctionProfile(FunctionId FuncName);
+  // Helper function to get the leaf frame's FunctionProfile by traversing the
+  // inline stack; meanwhile it adds the total samples for each frame's
+  // function profile. 
+  FunctionSamples &
+  getLeafProfileAndAddTotalSamples(const SampleContextFrameVector &FrameVec,
+                                   uint64_t Count);
+  void populateBodySamplesForAllFunctions(const RangeSample &RangeCounter);
+  void
+  populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters);
+  void
+  populateBodySamplesWithProbesForAllFunctions(const RangeSample &RangeCounter);
+  void populateBoundarySamplesWithProbesForAllFunctions(
+      const BranchSample &BranchCounters);
+  void postProcessProfiles();
+  void trimColdProfiles(const SampleProfileMap &Profiles,
+                        uint64_t ColdCntThreshold);
+  bool collectFunctionsFromLLVMProfile(
+      std::unordered_set<const BinaryFunction *> &ProfiledFunctions) override;
+};
+
+class CSProfileGenerator : public ProfileGeneratorBase {
+public:
+  CSProfileGenerator(ProfiledBinary *Binary,
+                     const ContextSampleCounterMap *Counters)
+      : ProfileGeneratorBase(Binary, Counters){};
+  CSProfileGenerator(ProfiledBinary *Binary, SampleProfileMap &Profiles)
+      : ProfileGeneratorBase(Binary), ContextTracker(Profiles, nullptr){};
+  void generateProfile() override;
+
+  // Trim the context stack at a given depth.
+  template <typename T>
+  static void trimContext(SmallVectorImpl<T> &S, int Depth = MaxContextDepth) {
+    if (Depth < 0 || static_cast<size_t>(Depth) >= S.size())
+      return;
+    std::copy(S.begin() + S.size() - static_cast<size_t>(Depth), S.end(),
+              S.begin());
+    S.resize(Depth);
+  }
+
+  // Remove adjacent repeated context sequences up to a given sequence length,
+  // -1 means no size limit. Note that repeated sequences are identified based
+  // on the exact call site; this is finer granularity than function recursion.
+  template <typename T>
+  static void compressRecursionContext(SmallVectorImpl<T> &Context,
+                                       int32_t CSize = MaxCompressionSize) {
+    uint32_t I = 1;
+    uint32_t HS = static_cast<uint32_t>(Context.size() / 2);
+    uint32_t MaxDedupSize =
+        CSize == -1 ? HS : std::min(static_cast<uint32_t>(CSize), HS);
+    auto BeginIter = Context.begin();
+    // Use an in-place algorithm to save memory copy
+    // End indicates the end location of current iteration's data
+    uint32_t End = 0;
+    // Deduplicate from length 1 to the max possible size of a repeated
+    // sequence.
+    while (I <= MaxDedupSize) {
+      // This is a linear algorithm that deduplicates adjacent repeated
+      // sequences of size I. The deduplication detection runs on a sliding
+      // window whose size is 2*I, and it keeps sliding the window to
+      // deduplicate the data inside. Once a duplication is detected,
+      // deduplicate it by skipping the right half of the window; otherwise
+      // just copy the new data back by appending it at the End pointer (for
+      // the next iteration).
+      //
+      // For example:
+      // Input: [a1, a2, b1, b2]
+      // (Indices are added to distinguish identical chars; the original is
+      // [a, a, b, b], and the size of the dedup window is 2 (I = 1) at the
+      // beginning.)
+      //
+      // 1) The initial status is a dummy window[null, a1], then just copy the
+      // right half of the window(End = 0), then slide the window.
+      // Result: [a1], a2, b1, b2 (End points to the element right before ],
+      // after ] is the data of the previous iteration)
+      //
+      // 2) Next window is [a1, a2]. Since a1 == a2, skip the right half of
+      // the window, i.e. a duplication happened. Only slide the window.
+      // Result: [a1], a2, b1, b2
+      //
+      // 3) Next window is [a2, b1], copy the right half of the window(b1 is
+      // new) to the End and slide the window.
+      // Result: [a1, b1], b1, b2
+      //
+      // 4) Next window is [b1, b2], same as 2), skip b2. 
+      // Result: [a1, b1], b1, b2
+      // After resize, it will be [a, b]
+
+      // Use pointers like below to do comparison inside the window
+      //    [a b c a b c]
+      //     |   |   |   |   |
+      // LeftBoundary Left Right Left+I Right+I
+      // A duplication is found if Left < LeftBoundary.
+
+      int32_t Right = I - 1;
+      End = I;
+      int32_t LeftBoundary = 0;
+      while (Right + I < Context.size()) {
+        // To avoid scanning a part of a sequence repeatedly, find the common
+        // suffix of the two halves in the window. The common suffix will
+        // serve as the common prefix of the next possible pair of duplicate
+        // sequences. The non-common part will be ignored and never scanned
+        // again.
+
+        // For example:
+        // Input: [a, b1], c1, b2, c2
+        // I = 2
+        //
+        // 1) For the window [a, b1, c1, b2], the non-common-suffix for the
+        // right part is 'c1', copy it and only slide the window 1 step.
+        // Result: [a, b1, c1], b2, c2
+        //
+        // 2) Next window is [b1, c1, b2, c2], so duplication happens.
+        // Result after resize: [a, b, c]
+
+        int32_t Left = Right;
+        while (Left >= LeftBoundary && Context[Left] == Context[Left + I]) {
+          // Find the longest suffix inside the window. When the loop stops,
+          // Left points at the diverging point in the current sequence.
+          Left--;
+        }
+
+        bool DuplicationFound = (Left < LeftBoundary);
+        // Don't need to recheck the data before Right
+        LeftBoundary = Right + 1;
+        if (DuplicationFound) {
+          // Duplication found, skip the right half of the window.
+          Right += I;
+        } else {
+          // Copy the non-common-suffix part of the adjacent sequence.
+          std::copy(BeginIter + Right + 1, BeginIter + Left + I + 1,
+                    BeginIter + End);
+          End += Left + I - Right;
+          // Only slide the window by the size of the non-common-suffix
+          Right = Left + I;
+        }
+      }
+      // Don't forget the remaining part that's not scanned.
+      std::copy(BeginIter + Right + 1, Context.end(), BeginIter + End);
+      End += Context.size() - Right - 1;
+      I++;
+      Context.resize(End);
+      MaxDedupSize = std::min(static_cast<uint32_t>(End / 2), MaxDedupSize);
+    }
+  }
+
+private:
+  void generateLineNumBasedProfile();
+
+  FunctionSamples *getOrCreateFunctionSamples(ContextTrieNode *ContextNode,
+                                              bool WasLeafInlined = false);
+
+  // Lookup or create ContextTrieNode for the context, FunctionSamples is
+  // created inside this function.
+  ContextTrieNode *getOrCreateContextNode(const SampleContextFrames Context,
+                                          bool WasLeafInlined = false);
+
+  // For profiled-only functions, compute their inline context function byte
+  // size on demand; this is used by the pre-inliner.
+  void computeSizeForProfiledFunctions();
+  // Post processing for profiles before writing out, such as merging
+  // and trimming cold profiles, running preinliner on profiles.
+ void postProcessProfiles(); + + void populateBodySamplesForFunction(FunctionSamples &FunctionProfile, + const RangeSample &RangeCounters); + + void populateBoundarySamplesForFunction(ContextTrieNode *CallerNode, + const BranchSample &BranchCounters); + + void populateInferredFunctionSamples(ContextTrieNode &Node); + + void updateFunctionSamples(); + + void generateProbeBasedProfile(); + + // Fill in function body samples from probes + void populateBodySamplesWithProbes(const RangeSample &RangeCounter, + const AddrBasedCtxKey *CtxKey); + // Fill in boundary samples for a call probe + void populateBoundarySamplesWithProbes(const BranchSample &BranchCounter, + const AddrBasedCtxKey *CtxKey); + + ContextTrieNode * + getContextNodeForLeafProbe(const AddrBasedCtxKey *CtxKey, + const MCDecodedPseudoProbe *LeafProbe); + + // Helper function to get FunctionSamples for the leaf probe + FunctionSamples & + getFunctionProfileForLeafProbe(const AddrBasedCtxKey *CtxKey, + const MCDecodedPseudoProbe *LeafProbe); + + void convertToProfileMap(ContextTrieNode &Node, + SampleContextFrameVector &Context); + + void convertToProfileMap(); + + void computeSummaryAndThreshold(); + + bool collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) override; + + void initializeMissingFrameInferrer(); + + // Given an input `Context`, output `NewContext` with inferred missing tail + // call frames. + void inferMissingFrames(const SmallVectorImpl &Context, + SmallVectorImpl &NewContext); + + ContextTrieNode &getRootContext() { return ContextTracker.getRootContext(); }; + + // The container for holding the FunctionSamples used by context trie. + std::list FSamplesList; + + // Underlying context table serves for sample profile writer. + std::unordered_set Contexts; + + SampleContextTracker ContextTracker; + + bool IsProfileValidOnTrie = true; + +public: + // Deduplicate adjacent repeated context sequences up to a given sequence + // length. -1 means no size limit. + static int32_t MaxCompressionSize; + static int MaxContextDepth; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-21.1/ProfiledBinary.cpp b/tools/ldc-profgen/ldc-profgen-21.1/ProfiledBinary.cpp new file mode 100644 index 0000000000..6847ba1b21 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-21.1/ProfiledBinary.cpp @@ -0,0 +1,1035 @@ +//===-- ProfiledBinary.cpp - Binary decoder ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ProfiledBinary.h"
+#include "ErrorHandling.h"
+#include "MissingFrameInferrer.h"
+#include "ProfileGenerator.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/Demangle/Demangle.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/TargetParser/Triple.h"
+#include <optional>
+
+#define DEBUG_TYPE "load-binary"
+
+using namespace llvm;
+using namespace sampleprof;
+
+cl::opt<bool> ShowDisassemblyOnly("show-disassembly-only",
+                                  cl::desc("Print disassembled code."));
+
+cl::opt<bool> ShowSourceLocations("show-source-locations",
+                                  cl::desc("Print source locations."));
+
+static cl::opt<bool>
+    ShowCanonicalFnName("show-canonical-fname",
+                        cl::desc("Print canonical function name."));
+
+static cl::opt<bool> ShowPseudoProbe(
+    "show-pseudo-probe",
+    cl::desc("Print pseudo probe section and disassembled info."));
+
+static cl::opt<bool> UseDwarfCorrelation(
+    "use-dwarf-correlation",
+    cl::desc("Use dwarf for profile correlation even when binary contains "
+             "pseudo probe."));
+
+static cl::opt<std::string>
+    DWPPath("dwp", cl::init(""),
+            cl::desc("Path of .dwp file. When not specified, it will be "
+                     "<binary>.dwp in the same directory as the main binary."));
+
+static cl::list<std::string> DisassembleFunctions(
+    "disassemble-functions", cl::CommaSeparated,
+    cl::desc("List of functions to print disassembly for. Accept demangled "
+             "names only. Only work with show-disassembly-only"));
+
+static cl::opt<bool>
+    KernelBinary("kernel",
+                 cl::desc("Generate the profile for Linux kernel binary."));
+
+extern cl::opt<bool> ShowDetailedWarning;
+extern cl::opt<bool> InferMissingFrames;
+
+namespace llvm {
+namespace sampleprof {
+
+static const Target *getTarget(const ObjectFile *Obj) {
+  Triple TheTriple = Obj->makeTriple();
+  std::string Error;
+  std::string ArchName;
+  const Target *TheTarget =
+      TargetRegistry::lookupTarget(ArchName, TheTriple, Error);
+  if (!TheTarget)
+    exitWithError(Error, Obj->getFileName());
+  return TheTarget;
+}
+
+void BinarySizeContextTracker::addInstructionForContext(
+    const SampleContextFrameVector &Context, uint32_t InstrSize) {
+  ContextTrieNode *CurNode = &RootContext;
+  bool IsLeaf = true;
+  for (const auto &Callsite : reverse(Context)) {
+    FunctionId CallerName = Callsite.Func;
+    LineLocation CallsiteLoc = IsLeaf ? LineLocation(0, 0) : Callsite.Location;
+    CurNode = CurNode->getOrCreateChildContext(CallsiteLoc, CallerName);
+    IsLeaf = false;
+  }
+
+  CurNode->addFunctionSize(InstrSize);
+}
+
+uint32_t
+BinarySizeContextTracker::getFuncSizeForContext(const ContextTrieNode *Node) {
+  ContextTrieNode *CurrNode = &RootContext;
+  ContextTrieNode *PrevNode = nullptr;
+
+  std::optional<uint32_t> Size;
+
+  // Start from top-level context-less function, traverse down the reverse
+  // context trie to find the best/longest match for given context, then
+  // retrieve the size.
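+  // Illustrative example (not from the original source): if sizes were
+  // recorded for the contexts [main @1 foo] and [bar @2 foo], querying
+  // [main @1 foo] walks root -> foo -> main and hits an exact match, while
+  // querying [baz @3 foo] only matches the shared "foo" prefix and falls
+  // back to the sibling walk below, returning the size of foo inlined into
+  // some other caller.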
+ LineLocation CallSiteLoc(0, 0); + while (CurrNode && Node->getParentContext() != nullptr) { + PrevNode = CurrNode; + CurrNode = CurrNode->getChildContext(CallSiteLoc, Node->getFuncName()); + if (CurrNode && CurrNode->getFunctionSize()) + Size = *CurrNode->getFunctionSize(); + CallSiteLoc = Node->getCallSiteLoc(); + Node = Node->getParentContext(); + } + + // If we traversed all nodes along the path of the context and haven't + // found a size yet, pivot to look for size from sibling nodes, i.e size + // of inlinee under different context. + if (!Size) { + if (!CurrNode) + CurrNode = PrevNode; + while (!Size && CurrNode && !CurrNode->getAllChildContext().empty()) { + CurrNode = &CurrNode->getAllChildContext().begin()->second; + if (CurrNode->getFunctionSize()) + Size = *CurrNode->getFunctionSize(); + } + } + + assert(Size && "We should at least find one context size."); + return *Size; +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder) { + ProbeFrameStack ProbeContext; + for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) + trackInlineesOptimizedAway(ProbeDecoder, Child, ProbeContext); +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder, + const MCDecodedPseudoProbeInlineTree &ProbeNode, + ProbeFrameStack &ProbeContext) { + StringRef FuncName = + ProbeDecoder.getFuncDescForGUID(ProbeNode.Guid)->FuncName; + ProbeContext.emplace_back(FuncName, 0); + + // This ProbeContext has a probe, so it has code before inlining and + // optimization. Make sure we mark its size as known. + if (!ProbeNode.getProbes().empty()) { + ContextTrieNode *SizeContext = &RootContext; + for (auto &ProbeFrame : reverse(ProbeContext)) { + StringRef CallerName = ProbeFrame.first; + LineLocation CallsiteLoc(ProbeFrame.second, 0); + SizeContext = + SizeContext->getOrCreateChildContext(CallsiteLoc, + FunctionId(CallerName)); + } + // Add 0 size to make known. + SizeContext->addFunctionSize(0); + } + + // DFS down the probe inline tree + for (const auto &ChildNode : ProbeNode.getChildren()) { + InlineSite Location = ChildNode.getInlineSite(); + ProbeContext.back().second = std::get<1>(Location); + trackInlineesOptimizedAway(ProbeDecoder, ChildNode, ProbeContext); + } + + ProbeContext.pop_back(); +} + +ProfiledBinary::ProfiledBinary(const StringRef ExeBinPath, + const StringRef DebugBinPath) + : Path(ExeBinPath), DebugBinaryPath(DebugBinPath), + SymbolizerOpts(getSymbolizerOpts()), ProEpilogTracker(this), + Symbolizer(std::make_unique(SymbolizerOpts)), + TrackFuncContextSize(EnableCSPreInliner && UseContextCostForPreInliner) { + // Point to executable binary if debug info binary is not specified. + SymbolizerPath = DebugBinPath.empty() ? 
ExeBinPath : DebugBinPath; + if (InferMissingFrames) + MissingContextInferrer = std::make_unique(this); + load(); +} + +ProfiledBinary::~ProfiledBinary() {} + +void ProfiledBinary::warnNoFuncEntry() { + uint64_t NoFuncEntryNum = 0; + for (auto &F : BinaryFunctions) { + if (F.second.Ranges.empty()) + continue; + bool hasFuncEntry = false; + for (auto &R : F.second.Ranges) { + if (FuncRange *FR = findFuncRangeForStartAddr(R.first)) { + if (FR->IsFuncEntry) { + hasFuncEntry = true; + break; + } + } + } + + if (!hasFuncEntry) { + NoFuncEntryNum++; + if (ShowDetailedWarning) + WithColor::warning() + << "Failed to determine function entry for " << F.first + << " due to inconsistent name from symbol table and dwarf info.\n"; + } + } + emitWarningSummary(NoFuncEntryNum, BinaryFunctions.size(), + "of functions failed to determine function entry due to " + "inconsistent name from symbol table and dwarf info."); +} + +void ProfiledBinary::load() { + // Attempt to open the binary. + OwningBinary OBinary = unwrapOrError(createBinary(Path), Path); + Binary &ExeBinary = *OBinary.getBinary(); + + IsCOFF = isa(&ExeBinary); + if (!isa(&ExeBinary) && !IsCOFF) + exitWithError("not a valid ELF/COFF image", Path); + + auto *Obj = cast(&ExeBinary); + TheTriple = Obj->makeTriple(); + + LLVM_DEBUG(dbgs() << "Loading " << Path << "\n"); + + // Mark the binary as a kernel image; + IsKernel = KernelBinary; + + // Find the preferred load address for text sections. + setPreferredTextSegmentAddresses(Obj); + + // Load debug info of subprograms from DWARF section. + // If path of debug info binary is specified, use the debug info from it, + // otherwise use the debug info from the executable binary. + if (!DebugBinaryPath.empty()) { + OwningBinary DebugPath = + unwrapOrError(createBinary(DebugBinaryPath), DebugBinaryPath); + loadSymbolsFromDWARF(*cast(DebugPath.getBinary())); + } else { + loadSymbolsFromDWARF(*cast(&ExeBinary)); + } + + DisassembleFunctionSet.insert_range(DisassembleFunctions); + + if (auto *ELFObj = dyn_cast(Obj)) { + checkPseudoProbe(ELFObj); + if (UsePseudoProbes) + populateElfSymbolAddressList(ELFObj); + + if (ShowDisassemblyOnly) + decodePseudoProbe(ELFObj); + } + + // Disassemble the text sections. + disassemble(Obj); + + // Use function start and return address to infer prolog and epilog + ProEpilogTracker.inferPrologAddresses(StartAddrToFuncRangeMap); + ProEpilogTracker.inferEpilogAddresses(RetAddressSet); + + warnNoFuncEntry(); + + // TODO: decode other sections. 
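+
+  // At this point the address-indexed structures built by disassemble()
+  // (CodeAddressVec, AddressToInstSizeMap and the call/return/branch address
+  // sets) are fully populated, so incoming sample addresses can be resolved
+  // against them.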
+} + +bool ProfiledBinary::inlineContextEqual(uint64_t Address1, uint64_t Address2) { + const SampleContextFrameVector &Context1 = + getCachedFrameLocationStack(Address1); + const SampleContextFrameVector &Context2 = + getCachedFrameLocationStack(Address2); + if (Context1.size() != Context2.size()) + return false; + if (Context1.empty()) + return false; + // The leaf frame contains location within the leaf, and it + // needs to be remove that as it's not part of the calling context + return std::equal(Context1.begin(), Context1.begin() + Context1.size() - 1, + Context2.begin(), Context2.begin() + Context2.size() - 1); +} + +SampleContextFrameVector +ProfiledBinary::getExpandedContext(const SmallVectorImpl &Stack, + bool &WasLeafInlined) { + SampleContextFrameVector ContextVec; + if (Stack.empty()) + return ContextVec; + // Process from frame root to leaf + for (auto Address : Stack) { + const SampleContextFrameVector &ExpandedContext = + getCachedFrameLocationStack(Address); + // An instruction without a valid debug line will be ignored by sample + // processing + if (ExpandedContext.empty()) + return SampleContextFrameVector(); + // Set WasLeafInlined to the size of inlined frame count for the last + // address which is leaf + WasLeafInlined = (ExpandedContext.size() > 1); + ContextVec.append(ExpandedContext); + } + + // Replace with decoded base discriminator + for (auto &Frame : ContextVec) { + Frame.Location.Discriminator = ProfileGeneratorBase::getBaseDiscriminator( + Frame.Location.Discriminator, UseFSDiscriminator); + } + + assert(ContextVec.size() && "Context length should be at least 1"); + + // Compress the context string except for the leaf frame + auto LeafFrame = ContextVec.back(); + LeafFrame.Location = LineLocation(0, 0); + ContextVec.pop_back(); + CSProfileGenerator::compressRecursionContext(ContextVec); + CSProfileGenerator::trimContext(ContextVec); + ContextVec.push_back(LeafFrame); + return ContextVec; +} + +template +void ProfiledBinary::setPreferredTextSegmentAddresses(const ELFFile &Obj, + StringRef FileName) { + const auto &PhdrRange = unwrapOrError(Obj.program_headers(), FileName); + // FIXME: This should be the page size of the system running profiling. + // However such info isn't available at post-processing time, assuming + // 4K page now. Note that we don't use EXEC_PAGESIZE from + // because we may build the tools on non-linux. + uint64_t PageSize = 0x1000; + for (const typename ELFT::Phdr &Phdr : PhdrRange) { + if (Phdr.p_type == ELF::PT_LOAD) { + if (!FirstLoadableAddress) + FirstLoadableAddress = Phdr.p_vaddr & ~(PageSize - 1U); + if (Phdr.p_flags & ELF::PF_X) { + // Segments will always be loaded at a page boundary. 
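+        // E.g. with an assumed p_vaddr of 0x401040 and a 4K page, the
+        // recorded address is 0x401040 & ~0xFFF = 0x401000 (illustrative
+        // values).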
+ PreferredTextSegmentAddresses.push_back(Phdr.p_vaddr & + ~(PageSize - 1U)); + TextSegmentOffsets.push_back(Phdr.p_offset & ~(PageSize - 1U)); + } + } + } + + if (PreferredTextSegmentAddresses.empty()) + exitWithError("no executable segment found", FileName); +} + +void ProfiledBinary::setPreferredTextSegmentAddresses(const COFFObjectFile *Obj, + StringRef FileName) { + uint64_t ImageBase = Obj->getImageBase(); + if (!ImageBase) + exitWithError("Not a COFF image", FileName); + + PreferredTextSegmentAddresses.push_back(ImageBase); + FirstLoadableAddress = ImageBase; + + for (SectionRef Section : Obj->sections()) { + const coff_section *Sec = Obj->getCOFFSection(Section); + if (Sec->Characteristics & COFF::IMAGE_SCN_CNT_CODE) + TextSegmentOffsets.push_back(Sec->VirtualAddress); + } +} + +void ProfiledBinary::setPreferredTextSegmentAddresses(const ObjectFile *Obj) { + if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *COFFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(COFFObj, Obj->getFileName()); + else + llvm_unreachable("invalid object format"); +} + +void ProfiledBinary::checkPseudoProbe(const ELFObjectFileBase *Obj) { + if (UseDwarfCorrelation) + return; + + bool HasProbeDescSection = false; + bool HasPseudoProbeSection = false; + + StringRef FileName = Obj->getFileName(); + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + if (SectionName == ".pseudo_probe_desc") { + HasProbeDescSection = true; + } else if (SectionName == ".pseudo_probe") { + HasPseudoProbeSection = true; + } + } + + // set UsePseudoProbes flag, used for PerfReader + UsePseudoProbes = HasProbeDescSection && HasPseudoProbeSection; +} + +void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) { + if (!UsePseudoProbes) + return; + + MCPseudoProbeDecoder::Uint64Set GuidFilter; + MCPseudoProbeDecoder::Uint64Map FuncStartAddresses; + if (ShowDisassemblyOnly) { + if (DisassembleFunctionSet.empty()) { + FuncStartAddresses = SymbolStartAddrs; + } else { + for (auto &F : DisassembleFunctionSet) { + auto GUID = Function::getGUIDAssumingExternalLinkage(F.first()); + if (auto StartAddr = SymbolStartAddrs.lookup(GUID)) { + FuncStartAddresses[GUID] = StartAddr; + FuncRange &Range = StartAddrToFuncRangeMap[StartAddr]; + GuidFilter.insert( + Function::getGUIDAssumingExternalLinkage(Range.getFuncName())); + } + } + } + } else { + for (auto *F : ProfiledFunctions) { + GuidFilter.insert(Function::getGUIDAssumingExternalLinkage(F->FuncName)); + for (auto &Range : F->Ranges) { + auto GUIDs = StartAddrToSymMap.equal_range(Range.first); + for (const auto &[StartAddr, Func] : make_range(GUIDs)) + FuncStartAddresses[Func] = StartAddr; + } + } + } + + StringRef FileName = Obj->getFileName(); + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + + if (SectionName == 
".pseudo_probe_desc") { + StringRef Contents = unwrapOrError(Section.getContents(), FileName); + if (!ProbeDecoder.buildGUID2FuncDescMap( + reinterpret_cast(Contents.data()), + Contents.size())) + exitWithError( + "Pseudo Probe decoder fail in .pseudo_probe_desc section"); + } else if (SectionName == ".pseudo_probe") { + StringRef Contents = unwrapOrError(Section.getContents(), FileName); + if (!ProbeDecoder.buildAddress2ProbeMap( + reinterpret_cast(Contents.data()), + Contents.size(), GuidFilter, FuncStartAddresses)) + exitWithError("Pseudo Probe decoder fail in .pseudo_probe section"); + } + } + + // Build TopLevelProbeFrameMap to track size for optimized inlinees when probe + // is available + if (TrackFuncContextSize) { + for (auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) { + auto *Frame = &Child; + StringRef FuncName = + ProbeDecoder.getFuncDescForGUID(Frame->Guid)->FuncName; + TopLevelProbeFrameMap[FuncName] = Frame; + } + } + + if (ShowPseudoProbe) + ProbeDecoder.printGUID2FuncDescMap(outs()); +} + +void ProfiledBinary::decodePseudoProbe() { + OwningBinary OBinary = unwrapOrError(createBinary(Path), Path); + Binary &ExeBinary = *OBinary.getBinary(); + auto *Obj = cast(&ExeBinary); + decodePseudoProbe(Obj); +} + +void ProfiledBinary::setIsFuncEntry(FuncRange *FuncRange, + StringRef RangeSymName) { + // Skip external function symbol. + if (!FuncRange) + return; + + // Set IsFuncEntry to ture if there is only one range in the function or the + // RangeSymName from ELF is equal to its DWARF-based function name. + if (FuncRange->Func->Ranges.size() == 1 || + (!FuncRange->IsFuncEntry && FuncRange->getFuncName() == RangeSymName)) + FuncRange->IsFuncEntry = true; +} + +bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef Bytes, + SectionSymbolsTy &Symbols, + const SectionRef &Section) { + std::size_t SE = Symbols.size(); + uint64_t SectionAddress = Section.getAddress(); + uint64_t SectSize = Section.getSize(); + uint64_t StartAddress = Symbols[SI].Addr; + uint64_t NextStartAddress = + (SI + 1 < SE) ? Symbols[SI + 1].Addr : SectionAddress + SectSize; + FuncRange *FRange = findFuncRange(StartAddress); + setIsFuncEntry(FRange, FunctionSamples::getCanonicalFnName(Symbols[SI].Name)); + StringRef SymbolName = + ShowCanonicalFnName + ? FunctionSamples::getCanonicalFnName(Symbols[SI].Name) + : Symbols[SI].Name; + bool ShowDisassembly = + ShowDisassemblyOnly && (DisassembleFunctionSet.empty() || + DisassembleFunctionSet.count(SymbolName)); + if (ShowDisassembly) + outs() << '<' << SymbolName << ">:\n"; + + uint64_t Address = StartAddress; + // Size of a consecutive invalid instruction range starting from Address -1 + // backwards. + uint64_t InvalidInstLength = 0; + while (Address < NextStartAddress) { + MCInst Inst; + uint64_t Size; + // Disassemble an instruction. 
+ bool Disassembled = DisAsm->getInstruction( + Inst, Size, Bytes.slice(Address - SectionAddress), Address, nulls()); + if (Size == 0) + Size = 1; + + if (ShowDisassembly) { + if (ShowPseudoProbe) { + ProbeDecoder.printProbeForAddress(outs(), Address); + } + outs() << format("%8" PRIx64 ":", Address); + size_t Start = outs().tell(); + if (Disassembled) + IPrinter->printInst(&Inst, Address + Size, "", *STI, outs()); + else + outs() << "\t"; + if (ShowSourceLocations) { + unsigned Cur = outs().tell() - Start; + if (Cur < 40) + outs().indent(40 - Cur); + InstructionPointer IP(this, Address); + outs() << getReversedLocWithContext( + symbolize(IP, ShowCanonicalFnName, ShowPseudoProbe)); + } + outs() << "\n"; + } + + if (Disassembled) { + const MCInstrDesc &MCDesc = MII->get(Inst.getOpcode()); + + // Record instruction size. + AddressToInstSizeMap[Address] = Size; + + // Populate address maps. + CodeAddressVec.push_back(Address); + if (MCDesc.isCall()) { + CallAddressSet.insert(Address); + UncondBranchAddrSet.insert(Address); + } else if (MCDesc.isReturn()) { + RetAddressSet.insert(Address); + UncondBranchAddrSet.insert(Address); + } else if (MCDesc.isBranch()) { + if (MCDesc.isUnconditionalBranch()) + UncondBranchAddrSet.insert(Address); + BranchAddressSet.insert(Address); + } + + // Record potential call targets for tail frame inference later-on. + if (InferMissingFrames && FRange) { + uint64_t Target = 0; + MIA->evaluateBranch(Inst, Address, Size, Target); + if (MCDesc.isCall()) { + // Indirect call targets are unknown at this point. Recording the + // unknown target (zero) for further LBR-based refinement. + MissingContextInferrer->CallEdges[Address].insert(Target); + } else if (MCDesc.isUnconditionalBranch()) { + assert(Target && + "target should be known for unconditional direct branch"); + // Any inter-function unconditional jump is considered tail call at + // this point. This is not 100% accurate and could further be + // optimized based on some source annotation. + FuncRange *ToFRange = findFuncRange(Target); + if (ToFRange && ToFRange->Func != FRange->Func) + MissingContextInferrer->TailCallEdges[Address].insert(Target); + LLVM_DEBUG({ + dbgs() << "Direct Tail call: " << format("%8" PRIx64 ":", Address); + IPrinter->printInst(&Inst, Address + Size, "", *STI.get(), dbgs()); + dbgs() << "\n"; + }); + } else if (MCDesc.isIndirectBranch() && MCDesc.isBarrier()) { + // This is an indirect branch but not necessarily an indirect tail + // call. The isBarrier check is to filter out conditional branch. + // Similar with indirect call targets, recording the unknown target + // (zero) for further LBR-based refinement. 
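+        // (Target stays 0 here because the branch target of an indirect
+        // branch cannot be evaluated statically; concrete targets come later
+        // from the LBR records.)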
+ MissingContextInferrer->TailCallEdges[Address].insert(Target); + LLVM_DEBUG({ + dbgs() << "Indirect Tail call: " + << format("%8" PRIx64 ":", Address); + IPrinter->printInst(&Inst, Address + Size, "", *STI.get(), dbgs()); + dbgs() << "\n"; + }); + } + } + + if (InvalidInstLength) { + AddrsWithInvalidInstruction.insert( + {Address - InvalidInstLength, Address - 1}); + InvalidInstLength = 0; + } + } else { + InvalidInstLength += Size; + } + + Address += Size; + } + + if (InvalidInstLength) + AddrsWithInvalidInstruction.insert( + {Address - InvalidInstLength, Address - 1}); + + if (ShowDisassembly) + outs() << "\n"; + + return true; +} + +void ProfiledBinary::setUpDisassembler(const ObjectFile *Obj) { + const Target *TheTarget = getTarget(Obj); + std::string TripleName = TheTriple.getTriple(); + StringRef FileName = Obj->getFileName(); + + MRI.reset(TheTarget->createMCRegInfo(TripleName)); + if (!MRI) + exitWithError("no register info for target " + TripleName, FileName); + + MCTargetOptions MCOptions; + AsmInfo.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); + if (!AsmInfo) + exitWithError("no assembly info for target " + TripleName, FileName); + + Expected Features = Obj->getFeatures(); + if (!Features) + exitWithError(Features.takeError(), FileName); + STI.reset( + TheTarget->createMCSubtargetInfo(TripleName, "", Features->getString())); + if (!STI) + exitWithError("no subtarget info for target " + TripleName, FileName); + + MII.reset(TheTarget->createMCInstrInfo()); + if (!MII) + exitWithError("no instruction info for target " + TripleName, FileName); + + MCContext Ctx(Triple(TripleName), AsmInfo.get(), MRI.get(), STI.get()); + std::unique_ptr MOFI( + TheTarget->createMCObjectFileInfo(Ctx, /*PIC=*/false)); + Ctx.setObjectFileInfo(MOFI.get()); + DisAsm.reset(TheTarget->createMCDisassembler(*STI, Ctx)); + if (!DisAsm) + exitWithError("no disassembler for target " + TripleName, FileName); + + MIA.reset(TheTarget->createMCInstrAnalysis(MII.get())); + + int AsmPrinterVariant = AsmInfo->getAssemblerDialect(); + IPrinter.reset(TheTarget->createMCInstPrinter( + Triple(TripleName), AsmPrinterVariant, *AsmInfo, *MII, *MRI)); + IPrinter->setPrintBranchImmAsAddress(true); +} + +void ProfiledBinary::disassemble(const ObjectFile *Obj) { + // Set up disassembler and related components. + setUpDisassembler(Obj); + + // Create a mapping from virtual address to symbol name. The symbols in text + // sections are the candidates to dissassemble. + std::map AllSymbols; + StringRef FileName = Obj->getFileName(); + for (const SymbolRef &Symbol : Obj->symbols()) { + const uint64_t Addr = unwrapOrError(Symbol.getAddress(), FileName); + const StringRef Name = unwrapOrError(Symbol.getName(), FileName); + section_iterator SecI = unwrapOrError(Symbol.getSection(), FileName); + if (SecI != Obj->section_end()) + AllSymbols[*SecI].push_back(SymbolInfoTy(Addr, Name, ELF::STT_NOTYPE)); + } + + // Sort all the symbols. Use a stable sort to stabilize the output. + for (std::pair &SecSyms : AllSymbols) + stable_sort(SecSyms.second); + + assert((DisassembleFunctionSet.empty() || ShowDisassemblyOnly) && + "Functions to disassemble should be only specified together with " + "--show-disassembly-only"); + + if (ShowDisassemblyOnly) + outs() << "\nDisassembly of " << FileName << ":\n"; + + // Dissassemble a text section. 
+ for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + if (!Section.isText()) + continue; + + uint64_t ImageLoadAddr = getPreferredBaseAddress(); + uint64_t SectionAddress = Section.getAddress() - ImageLoadAddr; + uint64_t SectSize = Section.getSize(); + if (!SectSize) + continue; + + // Register the text section. + TextSections.insert({SectionAddress, SectSize}); + + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + + if (ShowDisassemblyOnly) { + outs() << "\nDisassembly of section " << SectionName; + outs() << " [" << format("0x%" PRIx64, Section.getAddress()) << ", " + << format("0x%" PRIx64, Section.getAddress() + SectSize) + << "]:\n\n"; + } + + if (isa(Obj) && SectionName == ".plt") + continue; + + // Get the section data. + ArrayRef Bytes = + arrayRefFromStringRef(unwrapOrError(Section.getContents(), FileName)); + + // Get the list of all the symbols in this section. + SectionSymbolsTy &Symbols = AllSymbols[Section]; + + // Disassemble symbol by symbol. + for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) { + if (!dissassembleSymbol(SI, Bytes, Symbols, Section)) + exitWithError("disassembling error", FileName); + } + } + + if (!AddrsWithInvalidInstruction.empty()) { + if (ShowDetailedWarning) { + for (auto &Addr : AddrsWithInvalidInstruction) { + WithColor::warning() + << "Invalid instructions at " << format("%8" PRIx64, Addr.first) + << " - " << format("%8" PRIx64, Addr.second) << "\n"; + } + } + WithColor::warning() << "Found " << AddrsWithInvalidInstruction.size() + << " invalid instructions\n"; + AddrsWithInvalidInstruction.clear(); + } + + // Dissassemble rodata section to check if FS discriminator symbol exists. + checkUseFSDiscriminator(Obj, AllSymbols); +} + +void ProfiledBinary::checkUseFSDiscriminator( + const ObjectFile *Obj, std::map &AllSymbols) { + const char *FSDiscriminatorVar = "__llvm_fs_discriminator__"; + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + if (!Section.isData() || Section.getSize() == 0) + continue; + SectionSymbolsTy &Symbols = AllSymbols[Section]; + + for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) { + if (Symbols[SI].Name == FSDiscriminatorVar) { + UseFSDiscriminator = true; + return; + } + } + } +} + +void ProfiledBinary::populateElfSymbolAddressList( + const ELFObjectFileBase *Obj) { + // Create a mapping from virtual address to symbol GUID and the other way + // around. 
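+  // E.g. an alias sharing its start address with the primary symbol yields
+  // two entries for one address, which is why StartAddrToSymMap is a
+  // multimap (illustrative case).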
+ StringRef FileName = Obj->getFileName(); + for (const SymbolRef &Symbol : Obj->symbols()) { + const uint64_t Addr = unwrapOrError(Symbol.getAddress(), FileName); + const StringRef Name = unwrapOrError(Symbol.getName(), FileName); + uint64_t GUID = Function::getGUIDAssumingExternalLinkage(Name); + SymbolStartAddrs[GUID] = Addr; + StartAddrToSymMap.emplace(Addr, GUID); + } +} + +void ProfiledBinary::loadSymbolsFromDWARFUnit(DWARFUnit &CompilationUnit) { + for (const auto &DieInfo : CompilationUnit.dies()) { + llvm::DWARFDie Die(&CompilationUnit, &DieInfo); + + if (!Die.isSubprogramDIE()) + continue; + auto Name = Die.getName(llvm::DINameKind::LinkageName); + if (!Name) + Name = Die.getName(llvm::DINameKind::ShortName); + if (!Name) + continue; + + auto RangesOrError = Die.getAddressRanges(); + if (!RangesOrError) + continue; + const DWARFAddressRangesVector &Ranges = RangesOrError.get(); + + if (Ranges.empty()) + continue; + + // Different DWARF symbols can have same function name, search or create + // BinaryFunction indexed by the name. + auto Ret = BinaryFunctions.emplace(Name, BinaryFunction()); + auto &Func = Ret.first->second; + if (Ret.second) + Func.FuncName = Ret.first->first; + + for (const auto &Range : Ranges) { + uint64_t StartAddress = Range.LowPC; + uint64_t EndAddress = Range.HighPC; + + if (EndAddress <= StartAddress || + StartAddress < getPreferredBaseAddress()) + continue; + + // We may want to know all ranges for one function. Here group the + // ranges and store them into BinaryFunction. + Func.Ranges.emplace_back(StartAddress, EndAddress); + + auto R = StartAddrToFuncRangeMap.emplace(StartAddress, FuncRange()); + if (R.second) { + FuncRange &FRange = R.first->second; + FRange.Func = &Func; + FRange.StartAddress = StartAddress; + FRange.EndAddress = EndAddress; + } else { + AddrsWithMultipleSymbols.insert(StartAddress); + if (ShowDetailedWarning) + WithColor::warning() + << "Duplicated symbol start address at " + << format("%8" PRIx64, StartAddress) << " " + << R.first->second.getFuncName() << " and " << Name << "\n"; + } + } + } +} + +void ProfiledBinary::loadSymbolsFromDWARF(ObjectFile &Obj) { + auto DebugContext = llvm::DWARFContext::create( + Obj, DWARFContext::ProcessDebugRelocations::Process, nullptr, DWPPath); + if (!DebugContext) + exitWithError("Error creating the debug info context", Path); + + for (const auto &CompilationUnit : DebugContext->compile_units()) + loadSymbolsFromDWARFUnit(*CompilationUnit); + + // Handles DWO sections that can either be in .o, .dwo or .dwp files. + uint32_t NumOfDWOMissing = 0; + for (const auto &CompilationUnit : DebugContext->compile_units()) { + DWARFUnit *const DwarfUnit = CompilationUnit.get(); + if (DwarfUnit->getDWOId()) { + DWARFUnit *DWOCU = DwarfUnit->getNonSkeletonUnitDIE(false).getDwarfUnit(); + if (!DWOCU->isDWOUnit()) { + NumOfDWOMissing++; + if (ShowDetailedWarning) { + std::string DWOName = dwarf::toString( + DwarfUnit->getUnitDIE().find( + {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), + ""); + WithColor::warning() << "DWO debug information for " << DWOName + << " was not loaded.\n"; + } + continue; + } + loadSymbolsFromDWARFUnit(*DWOCU); + } + } + + if (NumOfDWOMissing) + WithColor::warning() + << " DWO debug information was not loaded for " << NumOfDWOMissing + << " modules. 
Please check the .o, .dwo or .dwp path.\n";
+  if (BinaryFunctions.empty())
+    WithColor::warning() << "Loading of DWARF info completed, but no binary "
+                            "functions have been retrieved.\n";
+  // Populate the hash binary function map for MD5 function name lookup. This
+  // is done after BinaryFunctions are finalized.
+  for (auto &BinaryFunction : BinaryFunctions) {
+    HashBinaryFunctions[MD5Hash(StringRef(BinaryFunction.first))] =
+        &BinaryFunction.second;
+  }
+
+  if (!AddrsWithMultipleSymbols.empty()) {
+    WithColor::warning() << "Found " << AddrsWithMultipleSymbols.size()
+                         << " start addresses with multiple symbols\n";
+    AddrsWithMultipleSymbols.clear();
+  }
+}
+
+void ProfiledBinary::populateSymbolListFromDWARF(
+    ProfileSymbolList &SymbolList) {
+  for (auto &I : StartAddrToFuncRangeMap)
+    SymbolList.add(I.second.getFuncName());
+}
+
+symbolize::LLVMSymbolizer::Options ProfiledBinary::getSymbolizerOpts() const {
+  symbolize::LLVMSymbolizer::Options SymbolizerOpts;
+  SymbolizerOpts.PrintFunctions =
+      DILineInfoSpecifier::FunctionNameKind::LinkageName;
+  SymbolizerOpts.Demangle = false;
+  SymbolizerOpts.DefaultArch = TheTriple.getArchName().str();
+  SymbolizerOpts.UseSymbolTable = false;
+  SymbolizerOpts.RelativeAddresses = false;
+  SymbolizerOpts.DWPName = DWPPath;
+  return SymbolizerOpts;
+}
+
+SampleContextFrameVector ProfiledBinary::symbolize(const InstructionPointer &IP,
+                                                   bool UseCanonicalFnName,
+                                                   bool UseProbeDiscriminator) {
+  assert(this == IP.Binary &&
+         "Binary should only symbolize its own instruction");
+  auto Addr = object::SectionedAddress{IP.Address,
+                                       object::SectionedAddress::UndefSection};
+  DIInliningInfo InlineStack = unwrapOrError(
+      Symbolizer->symbolizeInlinedCode(SymbolizerPath.str(), Addr),
+      SymbolizerPath);
+
+  SampleContextFrameVector CallStack;
+  for (int32_t I = InlineStack.getNumberOfFrames() - 1; I >= 0; I--) {
+    const auto &CallerFrame = InlineStack.getFrame(I);
+    if (CallerFrame.FunctionName.empty() ||
+        (CallerFrame.FunctionName == "<invalid>"))
+      break;
+
+    StringRef FunctionName(CallerFrame.FunctionName);
+    if (UseCanonicalFnName)
+      FunctionName = FunctionSamples::getCanonicalFnName(FunctionName);
+
+    uint32_t Discriminator = CallerFrame.Discriminator;
+    uint32_t LineOffset = (CallerFrame.Line - CallerFrame.StartLine) & 0xffff;
+    if (UseProbeDiscriminator) {
+      LineOffset =
+          PseudoProbeDwarfDiscriminator::extractProbeIndex(Discriminator);
+      Discriminator = 0;
+    }
+
+    LineLocation Line(LineOffset, Discriminator);
+    auto It = NameStrings.insert(FunctionName.str());
+    CallStack.emplace_back(FunctionId(StringRef(*It.first)), Line);
+  }
+
+  return CallStack;
+}
+
+void ProfiledBinary::computeInlinedContextSizeForRange(uint64_t RangeBegin,
+                                                       uint64_t RangeEnd) {
+  InstructionPointer IP(this, RangeBegin, true);
+
+  if (IP.Address != RangeBegin)
+    WithColor::warning() << "Invalid start instruction at "
+                         << format("%8" PRIx64, RangeBegin) << "\n";
+
+  if (IP.Address >= RangeEnd)
+    return;
+
+  do {
+    const SampleContextFrameVector SymbolizedCallStack =
+        getFrameLocationStack(IP.Address, UsePseudoProbes);
+    uint64_t Size = AddressToInstSizeMap[IP.Address];
+    // Record instruction size for the corresponding context
+    FuncSizeTracker.addInstructionForContext(SymbolizedCallStack, Size);
+
+  } while (IP.advance() && IP.Address < RangeEnd);
+}
+
+void ProfiledBinary::computeInlinedContextSizeForFunc(
+    const BinaryFunction *Func) {
+  // Note that a function can be split into multiple ranges, so compute for all
+  // ranges of the function.
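+  // E.g. under hot/cold splitting, foo and foo.cold can show up as two
+  // entries in Func->Ranges, and both are walked here (illustrative case).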
+ for (const auto &Range : Func->Ranges) + computeInlinedContextSizeForRange(Range.first, Range.second); + + // Track optimized-away inlinee for probed binary. A function inlined and then + // optimized away should still have their probes left over in places. + if (usePseudoProbes()) { + auto I = TopLevelProbeFrameMap.find(Func->FuncName); + if (I != TopLevelProbeFrameMap.end()) { + BinarySizeContextTracker::ProbeFrameStack ProbeContext; + FuncSizeTracker.trackInlineesOptimizedAway(ProbeDecoder, *I->second, + ProbeContext); + } + } +} + +void ProfiledBinary::inferMissingFrames( + const SmallVectorImpl &Context, + SmallVectorImpl &NewContext) { + MissingContextInferrer->inferMissingFrames(Context, NewContext); +} + +InstructionPointer::InstructionPointer(const ProfiledBinary *Binary, + uint64_t Address, bool RoundToNext) + : Binary(Binary), Address(Address) { + Index = Binary->getIndexForAddr(Address); + if (RoundToNext) { + // we might get address which is not the code + // it should round to the next valid address + if (Index >= Binary->getCodeAddrVecSize()) + this->Address = UINT64_MAX; + else + this->Address = Binary->getAddressforIndex(Index); + } +} + +bool InstructionPointer::advance() { + Index++; + if (Index >= Binary->getCodeAddrVecSize()) { + Address = UINT64_MAX; + return false; + } + Address = Binary->getAddressforIndex(Index); + return true; +} + +bool InstructionPointer::backward() { + if (Index == 0) { + Address = 0; + return false; + } + Index--; + Address = Binary->getAddressforIndex(Index); + return true; +} + +void InstructionPointer::update(uint64_t Addr) { + Address = Addr; + Index = Binary->getIndexForAddr(Address); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-21.1/ProfiledBinary.h b/tools/ldc-profgen/ldc-profgen-21.1/ProfiledBinary.h new file mode 100644 index 0000000000..0588cb48b2 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-21.1/ProfiledBinary.h @@ -0,0 +1,620 @@ +//===-- ProfiledBinary.h - Binary decoder -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PROFILEDBINARY_H +#define LLVM_TOOLS_LLVM_PROFGEN_PROFILEDBINARY_H + +#include "CallContext.h" +#include "ErrorHandling.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/Symbolize/Symbolize.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCPseudoProbe.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Path.h" +#include "llvm/Transforms/IPO/SampleContextTracker.h" +#include +#include +#include +#include +#include +#include +#include + +namespace llvm { +extern cl::opt EnableCSPreInliner; +extern cl::opt UseContextCostForPreInliner; +} // namespace llvm + +using namespace llvm; +using namespace sampleprof; +using namespace llvm::object; + +namespace llvm { +namespace sampleprof { + +class ProfiledBinary; +class MissingFrameInferrer; + +struct InstructionPointer { + const ProfiledBinary *Binary; + // Address of the executable segment of the binary. + uint64_t Address; + // Index to the sorted code address array of the binary. + uint64_t Index = 0; + InstructionPointer(const ProfiledBinary *Binary, uint64_t Address, + bool RoundToNext = false); + bool advance(); + bool backward(); + void update(uint64_t Addr); +}; + +// The special frame addresses. +enum SpecialFrameAddr { + // Dummy root of frame trie. + DummyRoot = 0, + // Represent all the addresses outside of current binary. + // This's also used to indicate the call stack should be truncated since this + // isn't a real call context the compiler will see. + ExternalAddr = 1, +}; + +using RangesTy = std::vector>; + +struct BinaryFunction { + StringRef FuncName; + // End of range is an exclusive bound. + RangesTy Ranges; + + uint64_t getFuncSize() { + uint64_t Sum = 0; + for (auto &R : Ranges) { + Sum += R.second - R.first; + } + return Sum; + } +}; + +// Info about function range. A function can be split into multiple +// non-continuous ranges, each range corresponds to one FuncRange. +struct FuncRange { + uint64_t StartAddress; + // EndAddress is an exclusive bound. + uint64_t EndAddress; + // Function the range belongs to + BinaryFunction *Func; + // Whether the start address is the real entry of the function. + bool IsFuncEntry = false; + + StringRef getFuncName() { return Func->FuncName; } +}; + +// PrologEpilog address tracker, used to filter out broken stack samples +// Currently we use a heuristic size (two) to infer prolog and epilog +// based on the start address and return address. In the future, +// we will switch to Dwarf CFI based tracker +struct PrologEpilogTracker { + // A set of prolog and epilog addresses. Used by virtual unwinding. 
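+  // Illustration: if foo's entry 0x1000 holds a 4-byte instruction, both
+  // 0x1000 and 0x1004 are treated as prolog; samples whose stack ends there
+  // may lack a complete frame and are filtered out.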
+  std::unordered_set<uint64_t> PrologEpilogSet;
+  ProfiledBinary *Binary;
+  PrologEpilogTracker(ProfiledBinary *Bin) : Binary(Bin){};
+
+  // Take the two addresses from the start of function as prolog
+  void
+  inferPrologAddresses(std::map<uint64_t, FuncRange> &FuncStartAddressMap) {
+    for (auto I : FuncStartAddressMap) {
+      PrologEpilogSet.insert(I.first);
+      InstructionPointer IP(Binary, I.first);
+      if (!IP.advance())
+        break;
+      PrologEpilogSet.insert(IP.Address);
+    }
+  }
+
+  // Take the last two addresses before the return address as epilog
+  void inferEpilogAddresses(std::unordered_set<uint64_t> &RetAddrs) {
+    for (auto Addr : RetAddrs) {
+      PrologEpilogSet.insert(Addr);
+      InstructionPointer IP(Binary, Addr);
+      if (!IP.backward())
+        break;
+      PrologEpilogSet.insert(IP.Address);
+    }
+  }
+};
+
+// Track function byte size under different contexts (the outlined version as
+// well as various inlined versions). It also provides query support to get
+// function size with the best matching context, which is used to help the
+// pre-inliner use accurate post-optimization size to make decisions.
+// TODO: If an inlinee is completely optimized away, ideally we should have
+// zero for its context size, currently we would miss such context since it
+// doesn't have instructions. To fix this, we need to mark all inlinees with
+// an entry probe but without instructions as having zero size.
+class BinarySizeContextTracker {
+public:
+  // Add instruction with given size to a context
+  void addInstructionForContext(const SampleContextFrameVector &Context,
+                                uint32_t InstrSize);
+
+  // Get function size with a specific context. When there's no exact match
+  // for the given context, try to retrieve the size of that function from
+  // the closest matching context.
+  uint32_t getFuncSizeForContext(const ContextTrieNode *Context);
+
+  // For inlinees that are fully optimized away, we can establish zero size
+  // using their remaining probes.
+  void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder);
+
+  using ProbeFrameStack = SmallVector<std::pair<StringRef, uint32_t>>;
+  void
+  trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder,
+                             const MCDecodedPseudoProbeInlineTree &ProbeNode,
+                             ProbeFrameStack &Context);
+
+  void dump() { RootContext.dumpTree(); }
+
+private:
+  // Root node for context trie tree, note that this is a reverse context trie
+  // with callee as parent and caller as child. This way we can traverse from
+  // root to find the best/longest matching context if an exact match does not
+  // exist. It gives us the best possible estimate for function's post-inline,
+  // post-optimization byte size.
+  ContextTrieNode RootContext;
+};
+
+using AddressRange = std::pair<uint64_t, uint64_t>;
+
+class ProfiledBinary {
+  // Absolute path of the executable binary.
+  std::string Path;
+  // Path of the debug info binary.
+  std::string DebugBinaryPath;
+  // The target triple.
+  Triple TheTriple;
+  // Path of the symbolizer, which should point to the binary with debug info.
+  StringRef SymbolizerPath;
+  // Options used to configure the symbolizer
+  symbolize::LLVMSymbolizer::Options SymbolizerOpts;
+  // The runtime base address that the first executable segment is loaded at.
+  uint64_t BaseAddress = 0;
+  // The runtime base address that the first loadable segment is loaded at.
+  uint64_t FirstLoadableAddress = 0;
+  // The preferred load address of each executable segment.
+  std::vector<uint64_t> PreferredTextSegmentAddresses;
+  // The file offset of each executable segment.
+ std::vector TextSegmentOffsets; + + // Mutiple MC component info + std::unique_ptr MRI; + std::unique_ptr AsmInfo; + std::unique_ptr STI; + std::unique_ptr MII; + std::unique_ptr DisAsm; + std::unique_ptr MIA; + std::unique_ptr IPrinter; + // A list of text sections sorted by start RVA and size. Used to check + // if a given RVA is a valid code address. + std::set> TextSections; + + // A map of mapping function name to BinaryFunction info. + std::unordered_map BinaryFunctions; + + // Lookup BinaryFunctions using the function name's MD5 hash. Needed if the + // profile is using MD5. + std::unordered_map HashBinaryFunctions; + + // A list of binary functions that have samples. + std::unordered_set ProfiledFunctions; + + // GUID to Elf symbol start address map + DenseMap SymbolStartAddrs; + + // These maps are for temporary use of warning diagnosis. + DenseSet AddrsWithMultipleSymbols; + DenseSet> AddrsWithInvalidInstruction; + + // Start address to Elf symbol GUID map + std::unordered_multimap StartAddrToSymMap; + + // An ordered map of mapping function's start address to function range + // relevant info. Currently to determine if the offset of ELF is the start of + // a real function, we leverage the function range info from DWARF. + std::map StartAddrToFuncRangeMap; + + // Address to context location map. Used to expand the context. + std::unordered_map AddressToLocStackMap; + + // Address to instruction size map. Also used for quick Address lookup. + std::unordered_map AddressToInstSizeMap; + + // An array of Addresses of all instructions sorted in increasing order. The + // sorting is needed to fast advance to the next forward/backward instruction. + std::vector CodeAddressVec; + // A set of call instruction addresses. Used by virtual unwinding. + std::unordered_set CallAddressSet; + // A set of return instruction addresses. Used by virtual unwinding. + std::unordered_set RetAddressSet; + // An ordered set of unconditional branch instruction addresses. + std::set UncondBranchAddrSet; + // A set of branch instruction addresses. + std::unordered_set BranchAddressSet; + + // Estimate and track function prolog and epilog ranges. + PrologEpilogTracker ProEpilogTracker; + + // Infer missing frames due to compiler optimizations such as tail call + // elimination. + std::unique_ptr MissingContextInferrer; + + // Track function sizes under different context + BinarySizeContextTracker FuncSizeTracker; + + // The symbolizer used to get inline context for an instruction. + std::unique_ptr Symbolizer; + + // String table owning function name strings created from the symbolizer. + std::unordered_set NameStrings; + + // A collection of functions to print disassembly for. + StringSet<> DisassembleFunctionSet; + + // Pseudo probe decoder + MCPseudoProbeDecoder ProbeDecoder; + + // Function name to probe frame map for top-level outlined functions. + StringMap TopLevelProbeFrameMap; + + bool UsePseudoProbes = false; + + bool UseFSDiscriminator = false; + + // Whether we need to symbolize all instructions to get function context size. + bool TrackFuncContextSize = false; + + // Whether this is a kernel image; + bool IsKernel = false; + + // Indicate if the base loading address is parsed from the mmap event or uses + // the preferred address + bool IsLoadedByMMap = false; + // Use to avoid redundant warning. 
+ bool MissingMMapWarned = false; + + bool IsCOFF = false; + + void setPreferredTextSegmentAddresses(const ObjectFile *O); + + template + void setPreferredTextSegmentAddresses(const ELFFile &Obj, + StringRef FileName); + void setPreferredTextSegmentAddresses(const COFFObjectFile *Obj, + StringRef FileName); + + void checkPseudoProbe(const ELFObjectFileBase *Obj); + + void decodePseudoProbe(const ELFObjectFileBase *Obj); + + void + checkUseFSDiscriminator(const ObjectFile *Obj, + std::map &AllSymbols); + + // Set up disassembler and related components. + void setUpDisassembler(const ObjectFile *Obj); + symbolize::LLVMSymbolizer::Options getSymbolizerOpts() const; + + // Load debug info of subprograms from DWARF section. + void loadSymbolsFromDWARF(ObjectFile &Obj); + + // Load debug info from DWARF unit. + void loadSymbolsFromDWARFUnit(DWARFUnit &CompilationUnit); + + // Create elf symbol to its start address mapping. + void populateElfSymbolAddressList(const ELFObjectFileBase *O); + + // A function may be spilt into multiple non-continuous address ranges. We use + // this to set whether start a function range is the real entry of the + // function and also set false to the non-function label. + void setIsFuncEntry(FuncRange *FRange, StringRef RangeSymName); + + // Warn if no entry range exists in the function. + void warnNoFuncEntry(); + + /// Dissassemble the text section and build various address maps. + void disassemble(const ObjectFile *O); + + /// Helper function to dissassemble the symbol and extract info for unwinding + bool dissassembleSymbol(std::size_t SI, ArrayRef Bytes, + SectionSymbolsTy &Symbols, const SectionRef &Section); + /// Symbolize a given instruction pointer and return a full call context. + SampleContextFrameVector symbolize(const InstructionPointer &IP, + bool UseCanonicalFnName = false, + bool UseProbeDiscriminator = false); + /// Decode the interesting parts of the binary and build internal data + /// structures. On high level, the parts of interest are: + /// 1. Text sections, including the main code section and the PLT + /// entries that will be used to handle cross-module call transitions. + /// 2. The .debug_line section, used by Dwarf-based profile generation. + /// 3. Pseudo probe related sections, used by probe-based profile + /// generation. + void load(); + +public: + ProfiledBinary(const StringRef ExeBinPath, const StringRef DebugBinPath); + ~ProfiledBinary(); + + void decodePseudoProbe(); + + StringRef getPath() const { return Path; } + StringRef getName() const { return llvm::sys::path::filename(Path); } + uint64_t getBaseAddress() const { return BaseAddress; } + void setBaseAddress(uint64_t Address) { BaseAddress = Address; } + + bool isCOFF() const { return IsCOFF; } + + // Canonicalize to use preferred load address as base address. + uint64_t canonicalizeVirtualAddress(uint64_t Address) { + return Address - BaseAddress + getPreferredBaseAddress(); + } + // Return the preferred load address for the first executable segment. + uint64_t getPreferredBaseAddress() const { + return PreferredTextSegmentAddresses[0]; + } + // Return the preferred load address for the first loadable segment. + uint64_t getFirstLoadableAddress() const { return FirstLoadableAddress; } + // Return the file offset for the first executable segment. 
+ uint64_t getTextSegmentOffset() const { return TextSegmentOffsets[0]; } + const std::vector &getPreferredTextSegmentAddresses() const { + return PreferredTextSegmentAddresses; + } + const std::vector &getTextSegmentOffsets() const { + return TextSegmentOffsets; + } + + uint64_t getInstSize(uint64_t Address) const { + auto I = AddressToInstSizeMap.find(Address); + if (I == AddressToInstSizeMap.end()) + return 0; + return I->second; + } + + bool addressIsCode(uint64_t Address) const { + return AddressToInstSizeMap.find(Address) != AddressToInstSizeMap.end(); + } + + bool addressIsCall(uint64_t Address) const { + return CallAddressSet.count(Address); + } + bool addressIsReturn(uint64_t Address) const { + return RetAddressSet.count(Address); + } + bool addressInPrologEpilog(uint64_t Address) const { + return ProEpilogTracker.PrologEpilogSet.count(Address); + } + + bool addressIsTransfer(uint64_t Address) { + return BranchAddressSet.count(Address) || RetAddressSet.count(Address) || + CallAddressSet.count(Address); + } + + bool rangeCrossUncondBranch(uint64_t Start, uint64_t End) { + if (Start >= End) + return false; + auto R = UncondBranchAddrSet.lower_bound(Start); + return R != UncondBranchAddrSet.end() && *R < End; + } + + uint64_t getAddressforIndex(uint64_t Index) const { + return CodeAddressVec[Index]; + } + + size_t getCodeAddrVecSize() const { return CodeAddressVec.size(); } + + bool usePseudoProbes() const { return UsePseudoProbes; } + bool useFSDiscriminator() const { return UseFSDiscriminator; } + bool isKernel() const { return IsKernel; } + + static bool isKernelImageName(StringRef BinaryName) { + return BinaryName == "[kernel.kallsyms]" || + BinaryName == "[kernel.kallsyms]_stext" || + BinaryName == "[kernel.kallsyms]_text"; + } + + // Get the index in CodeAddressVec for the address + // As we might get an address which is not the code + // here it would round to the next valid code address by + // using lower bound operation + uint32_t getIndexForAddr(uint64_t Address) const { + auto Low = llvm::lower_bound(CodeAddressVec, Address); + return Low - CodeAddressVec.begin(); + } + + uint64_t getCallAddrFromFrameAddr(uint64_t FrameAddr) const { + if (FrameAddr == ExternalAddr) + return ExternalAddr; + auto I = getIndexForAddr(FrameAddr); + FrameAddr = I ? getAddressforIndex(I - 1) : 0; + if (FrameAddr && addressIsCall(FrameAddr)) + return FrameAddr; + return 0; + } + + FuncRange *findFuncRangeForStartAddr(uint64_t Address) { + auto I = StartAddrToFuncRangeMap.find(Address); + if (I == StartAddrToFuncRangeMap.end()) + return nullptr; + return &I->second; + } + + // Binary search the function range which includes the input address. + FuncRange *findFuncRange(uint64_t Address) { + auto I = StartAddrToFuncRangeMap.upper_bound(Address); + if (I == StartAddrToFuncRangeMap.begin()) + return nullptr; + I--; + + if (Address >= I->second.EndAddress) + return nullptr; + + return &I->second; + } + + // Get all ranges of one function. + RangesTy getRanges(uint64_t Address) { + auto *FRange = findFuncRange(Address); + // Ignore the range which falls into plt section or system lib. 
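+    // (E.g. a branch target inside the PLT has no DWARF-backed FuncRange, so
+    // an empty range list is returned for it.)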
+  // Get all ranges of one function.
+  RangesTy getRanges(uint64_t Address) {
+    auto *FRange = findFuncRange(Address);
+    // Ignore the range which falls into the plt section or a system lib.
+    if (!FRange)
+      return RangesTy();
+
+    return FRange->Func->Ranges;
+  }
+
+  const std::unordered_map<std::string, BinaryFunction> &
+  getAllBinaryFunctions() {
+    return BinaryFunctions;
+  }
+
+  std::unordered_set<const BinaryFunction *> &getProfiledFunctions() {
+    return ProfiledFunctions;
+  }
+
+  void setProfiledFunctions(std::unordered_set<const BinaryFunction *> &Funcs) {
+    ProfiledFunctions = Funcs;
+  }
+
+  BinaryFunction *getBinaryFunction(FunctionId FName) {
+    if (FName.isStringRef()) {
+      auto I = BinaryFunctions.find(FName.str());
+      if (I == BinaryFunctions.end())
+        return nullptr;
+      return &I->second;
+    }
+    auto I = HashBinaryFunctions.find(FName.getHashCode());
+    if (I == HashBinaryFunctions.end())
+      return nullptr;
+    return I->second;
+  }
+
+  uint32_t getFuncSizeForContext(const ContextTrieNode *ContextNode) {
+    return FuncSizeTracker.getFuncSizeForContext(ContextNode);
+  }
+
+  void inferMissingFrames(const SmallVectorImpl<uint64_t> &Context,
+                          SmallVectorImpl<uint64_t> &NewContext);
+
+  // Load the symbols from the debug table and populate the symbol list.
+  void populateSymbolListFromDWARF(ProfileSymbolList &SymbolList);
+
+  SampleContextFrameVector
+  getFrameLocationStack(uint64_t Address, bool UseProbeDiscriminator = false) {
+    InstructionPointer IP(this, Address);
+    return symbolize(IP, SymbolizerOpts.UseSymbolTable, UseProbeDiscriminator);
+  }
+
+  const SampleContextFrameVector &
+  getCachedFrameLocationStack(uint64_t Address,
+                              bool UseProbeDiscriminator = false) {
+    auto I = AddressToLocStackMap.emplace(Address, SampleContextFrameVector());
+    if (I.second) {
+      I.first->second = getFrameLocationStack(Address, UseProbeDiscriminator);
+    }
+    return I.first->second;
+  }
+
+  std::optional<SampleContextFrame> getInlineLeafFrameLoc(uint64_t Address) {
+    const auto &Stack = getCachedFrameLocationStack(Address);
+    if (Stack.empty())
+      return {};
+    return Stack.back();
+  }
+
+  void flushSymbolizer() { Symbolizer.reset(); }
+
+  MissingFrameInferrer *getMissingContextInferrer() {
+    return MissingContextInferrer.get();
+  }
+
+  // Compare the inline contexts of two addresses.
+  bool inlineContextEqual(uint64_t Add1, uint64_t Add2);
+
+  // Get the full context of the current stack with inline context filled in.
+  // It will search the disassembly info stored in AddressToLocStackMap. This
+  // is used as the key of the function sample map.
+  SampleContextFrameVector
+  getExpandedContext(const SmallVectorImpl<uint64_t> &Stack,
+                     bool &WasLeafInlined);
+  // Go through the instructions in the given range and record their sizes for
+  // the inline context.
+  void computeInlinedContextSizeForRange(uint64_t StartAddress,
+                                         uint64_t EndAddress);
+
+  void computeInlinedContextSizeForFunc(const BinaryFunction *Func);
+
+  const MCDecodedPseudoProbe *getCallProbeForAddr(uint64_t Address) const {
+    return ProbeDecoder.getCallProbeForAddr(Address);
+  }
+
+  void getInlineContextForProbe(const MCDecodedPseudoProbe *Probe,
+                                SampleContextFrameVector &InlineContextStack,
+                                bool IncludeLeaf = false) const {
+    SmallVector<MCPseudoProbeFrameLocation, 16> ProbeInlineContext;
+    ProbeDecoder.getInlineContextForProbe(Probe, ProbeInlineContext,
+                                          IncludeLeaf);
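+    // Example for the loop below: given decoded frames {("a", 2), ("b", 0),
+    // ("c", 3)}, the zero callsite on the non-leaf "b" frame discards the
+    // partially built context {a:2}, so the resulting stack is {c:3}.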
+    for (uint32_t I = 0; I < ProbeInlineContext.size(); I++) {
+      auto &Callsite = ProbeInlineContext[I];
+      // Clear the current context for an unknown probe.
+      if (Callsite.second == 0 && I != ProbeInlineContext.size() - 1) {
+        InlineContextStack.clear();
+        continue;
+      }
+      InlineContextStack.emplace_back(FunctionId(Callsite.first),
+                                      LineLocation(Callsite.second, 0));
+    }
+  }
+  const AddressProbesMap &getAddress2ProbesMap() const {
+    return ProbeDecoder.getAddress2ProbesMap();
+  }
+  const MCPseudoProbeFuncDesc *getFuncDescForGUID(uint64_t GUID) {
+    return ProbeDecoder.getFuncDescForGUID(GUID);
+  }
+
+  const MCPseudoProbeFuncDesc *
+  getInlinerDescForProbe(const MCDecodedPseudoProbe *Probe) {
+    return ProbeDecoder.getInlinerDescForProbe(Probe);
+  }
+
+  bool getTrackFuncContextSize() { return TrackFuncContextSize; }
+
+  bool getIsLoadedByMMap() { return IsLoadedByMMap; }
+
+  void setIsLoadedByMMap(bool Value) { IsLoadedByMMap = Value; }
+
+  bool getMissingMMapWarned() { return MissingMMapWarned; }
+
+  void setMissingMMapWarned(bool Value) { MissingMMapWarned = Value; }
+};
+
+} // end namespace sampleprof
+} // end namespace llvm
+
+#endif
diff --git a/tools/ldc-profgen/ldc-profgen-21.1/llvm-profgen.cpp b/tools/ldc-profgen/ldc-profgen-21.1/llvm-profgen.cpp
new file mode 100644
index 0000000000..3b974e2510
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-21.1/llvm-profgen.cpp
@@ -0,0 +1,193 @@
+//===- llvm-profgen.cpp - LLVM SPGO profile generation tool -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// llvm-profgen generates SPGO profiles from perf script output.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ErrorHandling.h"
+#include "PerfReader.h"
+#include "ProfileGenerator.h"
+#include "ProfiledBinary.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/VirtualFileSystem.h"
+
+static cl::OptionCategory ProfGenCategory("ProfGen Options");
+
+static cl::opt<std::string> PerfScriptFilename(
+    "perfscript", cl::value_desc("perfscript"),
+    cl::desc("Path of perf-script trace created by Linux perf tool with "
+             "`script` command (the raw perf.data should be profiled with -b)"),
+    cl::cat(ProfGenCategory));
+static cl::alias PSA("ps", cl::desc("Alias for --perfscript"),
+                     cl::aliasopt(PerfScriptFilename));
+
+static cl::opt<std::string> PerfDataFilename(
+    "perfdata", cl::value_desc("perfdata"),
+    cl::desc("Path of raw perf data created by Linux perf tool (it should be "
+             "profiled with -b)"),
+    cl::cat(ProfGenCategory));
+static cl::alias PDA("pd", cl::desc("Alias for --perfdata"),
+                     cl::aliasopt(PerfDataFilename));
+
+static cl::opt<std::string> UnsymbolizedProfFilename(
+    "unsymbolized-profile", cl::value_desc("unsymbolized profile"),
+    cl::desc("Path of the unsymbolized profile created by "
+             "`llvm-profgen` with `--skip-symbolization`"),
+    cl::cat(ProfGenCategory));
+static cl::alias UPA("up", cl::desc("Alias for --unsymbolized-profile"),
+                     cl::aliasopt(UnsymbolizedProfFilename));
+
+static cl::opt<std::string> SampleProfFilename(
+    "llvm-sample-profile", cl::value_desc("llvm sample profile"),
+    cl::desc("Path of the LLVM sample profile"), cl::cat(ProfGenCategory));
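+// Illustrative invocations (a sketch; flags like --output and
+// --skip-symbolization are declared in other llvm-profgen translation units):
+//   llvm-profgen --binary=./app --perfscript=perf.script --output=app.prof
+//   llvm-profgen --binary=./app --perfdata=perf.data --skip-symbolization
+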
+static cl::opt<std::string>
+    BinaryPath("binary", cl::value_desc("binary"), cl::Required,
+               cl::desc("Path of profiled executable binary."),
+               cl::cat(ProfGenCategory));
+
+static cl::opt<uint32_t>
+    ProcessId("pid", cl::value_desc("process Id"), cl::init(0),
+              cl::desc("Process Id for the profiled executable binary."),
+              cl::cat(ProfGenCategory));
+
+static cl::opt<std::string> DebugBinPath(
+    "debug-binary", cl::value_desc("debug-binary"),
+    cl::desc("Path of debug info binary, llvm-profgen will load the DWARF info "
+             "from it instead of the executable binary."),
+    cl::cat(ProfGenCategory));
+
+extern cl::opt<bool> ShowDisassemblyOnly;
+extern cl::opt<bool> ShowSourceLocations;
+extern cl::opt<bool> SkipSymbolization;
+
+using namespace llvm;
+using namespace sampleprof;
+
+// Validate the command line input.
+static void validateCommandLine() {
+  // Allow a missing perfscript if we only want to show binary disassembly.
+  if (!ShowDisassemblyOnly) {
+    // Validate that an input profile is provided only once.
+    bool HasPerfData = PerfDataFilename.getNumOccurrences() > 0;
+    bool HasPerfScript = PerfScriptFilename.getNumOccurrences() > 0;
+    bool HasUnsymbolizedProfile =
+        UnsymbolizedProfFilename.getNumOccurrences() > 0;
+    bool HasSampleProfile = SampleProfFilename.getNumOccurrences() > 0;
+    uint16_t S =
+        HasPerfData + HasPerfScript + HasUnsymbolizedProfile + HasSampleProfile;
+    if (S != 1) {
+      std::string Msg =
+          S > 1
+              ? "`--perfscript`, `--perfdata` and `--unsymbolized-profile` "
+                "cannot be used together."
+              : "Perf input file is missing, please use one of `--perfscript`, "
+                "`--perfdata` and `--unsymbolized-profile` for the input.";
+      exitWithError(Msg);
+    }
+
+    auto CheckFileExists = [](bool H, StringRef File) {
+      if (H && !llvm::sys::fs::exists(File)) {
+        std::string Msg = "Input perf file(" + File.str() + ") doesn't exist.";
+        exitWithError(Msg);
+      }
+    };
+
+    CheckFileExists(HasPerfData, PerfDataFilename);
+    CheckFileExists(HasPerfScript, PerfScriptFilename);
+    CheckFileExists(HasUnsymbolizedProfile, UnsymbolizedProfFilename);
+    CheckFileExists(HasSampleProfile, SampleProfFilename);
+  }
+
+  if (!llvm::sys::fs::exists(BinaryPath)) {
+    std::string Msg = "Input binary(" + BinaryPath + ") doesn't exist.";
+    exitWithError(Msg);
+  }
+
+  if (CSProfileGenerator::MaxCompressionSize < -1) {
+    exitWithError("Value of --compress-recursion should be >= -1");
+  }
+  if (ShowSourceLocations && !ShowDisassemblyOnly) {
+    exitWithError("--show-source-locations should work together with "
+                  "--show-disassembly-only!");
+  }
+}
+
+static PerfInputFile getPerfInputFile() {
+  PerfInputFile File;
+  if (PerfDataFilename.getNumOccurrences()) {
+    File.InputFile = PerfDataFilename;
+    File.Format = PerfFormat::PerfData;
+  } else if (PerfScriptFilename.getNumOccurrences()) {
+    File.InputFile = PerfScriptFilename;
+    File.Format = PerfFormat::PerfScript;
+  } else if (UnsymbolizedProfFilename.getNumOccurrences()) {
+    File.InputFile = UnsymbolizedProfFilename;
+    File.Format = PerfFormat::UnsymbolizedProfile;
+  }
+  return File;
+}
+
+int main(int argc, const char *argv[]) {
+  InitLLVM X(argc, argv);
+
+  // Initialize targets and assembly printers/parsers.
+  InitializeAllTargetInfos();
+  InitializeAllTargetMCs();
+  InitializeAllDisassemblers();
+
+  cl::HideUnrelatedOptions({&ProfGenCategory, &getColorCategory()});
+  cl::ParseCommandLineOptions(argc, argv, "llvm SPGO profile generator\n");
+  validateCommandLine();
+
+  // Load symbols and disassemble the code of a given binary.
+  std::unique_ptr<ProfiledBinary> Binary =
+      std::make_unique<ProfiledBinary>(BinaryPath, DebugBinPath);
+  if (ShowDisassemblyOnly)
+    return EXIT_SUCCESS;
+
+  if (SampleProfFilename.getNumOccurrences()) {
+    LLVMContext Context;
+    auto FS = vfs::getRealFileSystem();
+    auto ReaderOrErr =
+        SampleProfileReader::create(SampleProfFilename, Context, *FS);
+    std::unique_ptr<SampleProfileReader> Reader =
+        std::move(ReaderOrErr.get());
+    Reader->read();
+    std::unique_ptr<ProfileGeneratorBase> Generator =
+        ProfileGeneratorBase::create(Binary.get(), Reader->getProfiles(),
+                                     Reader->profileIsCS());
+    Generator->generateProfile();
+    Generator->write();
+  } else {
+    std::optional<int32_t> PIDFilter;
+    if (ProcessId.getNumOccurrences())
+      PIDFilter = ProcessId;
+    PerfInputFile PerfFile = getPerfInputFile();
+    std::unique_ptr<PerfReaderBase> Reader =
+        PerfReaderBase::create(Binary.get(), PerfFile, PIDFilter);
+    // Parse perf events and samples.
+    Reader->parsePerfTraces();
+
+    if (SkipSymbolization)
+      return EXIT_SUCCESS;
+
+    std::unique_ptr<ProfileGeneratorBase> Generator =
+        ProfileGeneratorBase::create(Binary.get(), &Reader->getSampleCounters(),
+                                     Reader->profileIsCS());
+    Generator->generateProfile();
+    Generator->write();
+  }
+
+  return EXIT_SUCCESS;
+}
diff --git a/utils/FileCheck-21.cpp b/utils/FileCheck-21.cpp
new file mode 100644
index 0000000000..9cf3a3164d
--- /dev/null
+++ b/utils/FileCheck-21.cpp
@@ -0,0 +1,879 @@
+//===- FileCheck.cpp - Check that File's Contents match what is expected --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// FileCheck does a line-by-line check of a file that validates whether it
+// contains the expected content. This is useful for regression tests etc.
+//
+// This program exits with an exit status of 2 on error, exit status of 0 if
+// the file matched the expected contents, and exit status of 1 if it did not
+// contain the expected contents.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/FileCheck/FileCheck.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cmath>
+#include <map>
+using namespace llvm;
+
+static cl::extrahelp FileCheckOptsEnv(
+    "\nOptions are parsed from the environment variable FILECHECK_OPTS and\n"
+    "from the command line.\n");
+
+static cl::opt<std::string>
+    CheckFilename(cl::Positional, cl::desc("<check-file>"), cl::Optional);
+
+static cl::opt<std::string>
+    InputFilename("input-file", cl::desc("File to check (defaults to stdin)"),
+                  cl::init("-"), cl::value_desc("filename"));
+
+static cl::list<std::string> CheckPrefixes(
+    "check-prefix",
+    cl::desc("Prefix to use from check file (defaults to 'CHECK')"));
+static cl::alias CheckPrefixesAlias(
+    "check-prefixes", cl::aliasopt(CheckPrefixes), cl::CommaSeparated,
+    cl::NotHidden,
+    cl::desc(
+        "Alias for -check-prefix permitting multiple comma separated values"));
+
+static cl::list<std::string> CommentPrefixes(
+    "comment-prefixes", cl::CommaSeparated, cl::Hidden,
+    cl::desc("Comma-separated list of comment prefixes to use from check file\n"
+             "(defaults to 'COM,RUN'). Please avoid using this feature in\n"
+             "LLVM's LIT-based test suites, which should be easier to\n"
+             "maintain if they all follow a consistent comment style. This\n"
This\n" + "feature is meant for non-LIT test suites using FileCheck.")); + +static cl::opt NoCanonicalizeWhiteSpace( + "strict-whitespace", + cl::desc("Do not treat all horizontal whitespace as equivalent")); + +static cl::opt IgnoreCase( + "ignore-case", + cl::desc("Use case-insensitive matching")); + +static cl::list ImplicitCheckNot( + "implicit-check-not", + cl::desc("Add an implicit negative check with this pattern to every\n" + "positive check. This can be used to ensure that no instances of\n" + "this pattern occur which are not matched by a positive pattern"), + cl::value_desc("pattern")); + +static cl::list + GlobalDefines("D", cl::AlwaysPrefix, + cl::desc("Define a variable to be used in capture patterns."), + cl::value_desc("VAR=VALUE")); + +static cl::opt AllowEmptyInput( + "allow-empty", cl::init(false), + cl::desc("Allow the input file to be empty. This is useful when making\n" + "checks that some error message does not occur, for example.")); + +static cl::opt AllowUnusedPrefixes( + "allow-unused-prefixes", + cl::desc("Allow prefixes to be specified but not appear in the test.")); + +static cl::opt MatchFullLines( + "match-full-lines", cl::init(false), + cl::desc("Require all positive matches to cover an entire input line.\n" + "Allows leading and trailing whitespace if --strict-whitespace\n" + "is not also passed.")); + +static cl::opt EnableVarScope( + "enable-var-scope", cl::init(false), + cl::desc("Enables scope for regex variables. Variables with names that\n" + "do not start with '$' will be reset at the beginning of\n" + "each CHECK-LABEL block.")); + +static cl::opt AllowDeprecatedDagOverlap( + "allow-deprecated-dag-overlap", cl::init(false), + cl::desc("Enable overlapping among matches in a group of consecutive\n" + "CHECK-DAG directives. This option is deprecated and is only\n" + "provided for convenience as old tests are migrated to the new\n" + "non-overlapping CHECK-DAG implementation.\n")); + +static cl::opt Verbose( + "v", + cl::desc("Print directive pattern matches, or add them to the input dump\n" + "if enabled.\n")); + +static cl::opt VerboseVerbose( + "vv", + cl::desc("Print information helpful in diagnosing internal FileCheck\n" + "issues, or add it to the input dump if enabled. Implies\n" + "-v.\n")); + +// The order of DumpInputValue members affects their precedence, as documented +// for -dump-input below. +enum DumpInputValue { + DumpInputNever, + DumpInputFail, + DumpInputAlways, + DumpInputHelp +}; + +static cl::list DumpInputs( + "dump-input", + cl::desc("Dump input to stderr, adding annotations representing\n" + "currently enabled diagnostics. When there are multiple\n" + "occurrences of this option, the that appears earliest\n" + "in the list below has precedence. The default is 'fail'.\n"), + cl::value_desc("mode"), + cl::values(clEnumValN(DumpInputHelp, "help", "Explain input dump and quit"), + clEnumValN(DumpInputAlways, "always", "Always dump input"), + clEnumValN(DumpInputFail, "fail", "Dump input on failure"), + clEnumValN(DumpInputNever, "never", "Never dump input"))); + +// The order of DumpInputFilterValue members affects their precedence, as +// documented for -dump-input-filter below. 
+enum DumpInputFilterValue {
+  DumpInputFilterError,
+  DumpInputFilterAnnotation,
+  DumpInputFilterAnnotationFull,
+  DumpInputFilterAll
+};
+
+static cl::list<DumpInputFilterValue> DumpInputFilters(
+    "dump-input-filter",
+    cl::desc("In the dump requested by -dump-input, print only input lines of\n"
+             "kind <value> plus any context specified by -dump-input-context.\n"
+             "When there are multiple occurrences of this option, the <value>\n"
+             "that appears earliest in the list below has precedence. The\n"
+             "default is 'error' when -dump-input=fail, and it's 'all' when\n"
+             "-dump-input=always.\n"),
+    cl::values(clEnumValN(DumpInputFilterAll, "all", "All input lines"),
+               clEnumValN(DumpInputFilterAnnotationFull, "annotation-full",
+                          "Input lines with annotations"),
+               clEnumValN(DumpInputFilterAnnotation, "annotation",
+                          "Input lines with starting points of annotations"),
+               clEnumValN(DumpInputFilterError, "error",
+                          "Input lines with starting points of error "
+                          "annotations")));
+
+static cl::list<unsigned> DumpInputContexts(
+    "dump-input-context", cl::value_desc("N"),
+    cl::desc("In the dump requested by -dump-input, print <N> input lines\n"
+             "before and <N> input lines after any lines specified by\n"
+             "-dump-input-filter. When there are multiple occurrences of\n"
+             "this option, the largest specified <N> has precedence. The\n"
+             "default is 5.\n"));
+
+typedef cl::list<std::string>::const_iterator prefix_iterator;
+
+
+
+
+
+
+
+static void DumpCommandLine(int argc, char **argv) {
+  errs() << "FileCheck command line: ";
+  for (int I = 0; I < argc; I++)
+    errs() << " " << argv[I];
+  errs() << "\n";
+}
+
+struct MarkerStyle {
+  /// The starting char (before tildes) for marking the line.
+  char Lead;
+  /// What color to use for this annotation.
+  raw_ostream::Colors Color;
+  /// A note to follow the marker, or empty string if none.
+  std::string Note;
+  /// Does this marker indicate inclusion by -dump-input-filter=error?
+  bool FiltersAsError;
+  MarkerStyle() {}
+  MarkerStyle(char Lead, raw_ostream::Colors Color,
+              const std::string &Note = "", bool FiltersAsError = false)
+      : Lead(Lead), Color(Color), Note(Note), FiltersAsError(FiltersAsError) {
+    assert((!FiltersAsError || !Note.empty()) &&
+           "expected error diagnostic to have note");
+  }
+};
+
+static MarkerStyle GetMarker(FileCheckDiag::MatchType MatchTy) {
+  switch (MatchTy) {
+  case FileCheckDiag::MatchFoundAndExpected:
+    return MarkerStyle('^', raw_ostream::GREEN);
+  case FileCheckDiag::MatchFoundButExcluded:
+    return MarkerStyle('!', raw_ostream::RED, "error: no match expected",
+                       /*FiltersAsError=*/true);
+  case FileCheckDiag::MatchFoundButWrongLine:
+    return MarkerStyle('!', raw_ostream::RED, "error: match on wrong line",
+                       /*FiltersAsError=*/true);
+  case FileCheckDiag::MatchFoundButDiscarded:
+    return MarkerStyle('!', raw_ostream::CYAN,
+                       "discard: overlaps earlier match");
+  case FileCheckDiag::MatchFoundErrorNote:
+    // Note should always be overridden within the FileCheckDiag.
+    return MarkerStyle('!', raw_ostream::RED,
+                       "error: unknown error after match",
+                       /*FiltersAsError=*/true);
+  case FileCheckDiag::MatchNoneAndExcluded:
+    return MarkerStyle('X', raw_ostream::GREEN);
+  case FileCheckDiag::MatchNoneButExpected:
+    return MarkerStyle('X', raw_ostream::RED, "error: no match found",
+                       /*FiltersAsError=*/true);
+  case FileCheckDiag::MatchNoneForInvalidPattern:
+    return MarkerStyle('X', raw_ostream::RED,
+                       "error: match failed for invalid pattern",
+                       /*FiltersAsError=*/true);
+  case FileCheckDiag::MatchFuzzy:
+    return MarkerStyle('?', raw_ostream::MAGENTA, "possible intended match",
+                       /*FiltersAsError=*/true);
+  }
+  llvm_unreachable_internal("unexpected match type");
+}
+
+static void DumpInputAnnotationHelp(raw_ostream &OS) {
+  OS << "The following description was requested by -dump-input=help to\n"
+     << "explain the input dump printed by FileCheck.\n"
+     << "\n"
+     << "Related command-line options:\n"
+     << "\n"
+     << "  - -dump-input=<value> enables or disables the input dump\n"
+     << "  - -dump-input-filter=<value> filters the input lines\n"
+     << "  - -dump-input-context=<N> adjusts the context of filtered lines\n"
+     << "  - -v and -vv add more annotations\n"
+     << "  - -color forces colors to be enabled both in the dump and below\n"
+     << "  - -help documents the above options in more detail\n"
+     << "\n"
+     << "These options can also be set via FILECHECK_OPTS. For example, for\n"
+     << "maximum debugging output on failures:\n"
+     << "\n"
+     << "  $ FILECHECK_OPTS='-dump-input-filter=all -vv -color' ninja check\n"
+     << "\n"
+     << "Input dump annotation format:\n"
+     << "\n";
+
+  // Labels for input lines.
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "L:";
+  OS << "     labels line number L of the input file\n"
+     << "           An extra space is added after each input line to represent"
+     << " the\n"
+     << "           newline character\n";
+
+  // Labels for annotation lines.
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "T:L";
+  OS << "    labels the only match result for either (1) a pattern of type T"
+     << " from\n"
+     << "           line L of the check file if L is an integer or (2) the"
+     << " I-th implicit\n"
+     << "           pattern if L is \"imp\" followed by an integer "
+     << "I (index origin one)\n";
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "T:L'N";
+  OS << "  labels the Nth match result for such a pattern\n";
+
+  // Markers on annotation lines.
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "^~~";
+  OS << "    marks good match (reported if -v)\n"
+     << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "!~~";
+  OS << "    marks bad match, such as:\n"
+     << "           - CHECK-NEXT on same line as previous match (error)\n"
+     << "           - CHECK-NOT found (error)\n"
+     << "           - CHECK-DAG overlapping match (discarded, reported if "
+     << "-vv)\n"
+     << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "X~~";
+  OS << "    marks search range when no match is found, such as:\n"
+     << "           - CHECK-NEXT not found (error)\n"
+     << "           - CHECK-NOT not found (success, reported if -vv)\n"
+     << "           - CHECK-DAG not found after discarded matches (error)\n"
+     << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "?";
+  OS << "      marks fuzzy match when no match is found\n";
+
+  // Elided lines.
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "...";
+  OS << "    indicates elided input lines and annotations, as specified by\n"
+     << "           -dump-input-filter and -dump-input-context\n";
+
+  // Colors.
+ OS << " - colors "; + WithColor(OS, raw_ostream::GREEN, true) << "success"; + OS << ", "; + WithColor(OS, raw_ostream::RED, true) << "error"; + OS << ", "; + WithColor(OS, raw_ostream::MAGENTA, true) << "fuzzy match"; + OS << ", "; + WithColor(OS, raw_ostream::CYAN, true, false) << "discarded match"; + OS << ", "; + WithColor(OS, raw_ostream::CYAN, true, true) << "unmatched input"; + OS << "\n"; +} + +/// An annotation for a single input line. +struct InputAnnotation { + /// The index of the match result across all checks + unsigned DiagIndex; + /// The label for this annotation. + std::string Label; + /// Is this the initial fragment of a diagnostic that has been broken across + /// multiple lines? + bool IsFirstLine; + /// What input line (one-origin indexing) this annotation marks. This might + /// be different from the starting line of the original diagnostic if + /// !IsFirstLine. + unsigned InputLine; + /// The column range (one-origin indexing, open end) in which to mark the + /// input line. If InputEndCol is UINT_MAX, treat it as the last column + /// before the newline. + unsigned InputStartCol, InputEndCol; + /// The marker to use. + MarkerStyle Marker; + /// Whether this annotation represents a good match for an expected pattern. + bool FoundAndExpectedMatch; +}; + +/// Get an abbreviation for the check type. +static std::string GetCheckTypeAbbreviation(Check::FileCheckType Ty) { + switch (Ty) { + case Check::CheckPlain: + if (Ty.getCount() > 1) + return "count"; + return "check"; + case Check::CheckNext: + return "next"; + case Check::CheckSame: + return "same"; + case Check::CheckNot: + return "not"; + case Check::CheckDAG: + return "dag"; + case Check::CheckLabel: + return "label"; + case Check::CheckEmpty: + return "empty"; + case Check::CheckComment: + return "com"; + case Check::CheckEOF: + return "eof"; + case Check::CheckBadNot: + return "bad-not"; + case Check::CheckBadCount: + return "bad-count"; + case Check::CheckMisspelled: + return "misspelled"; + case Check::CheckNone: + llvm_unreachable("invalid FileCheckType"); + } + llvm_unreachable("unknown FileCheckType"); +} + +static void +BuildInputAnnotations(const SourceMgr &SM, unsigned CheckFileBufferID, + const std::pair &ImpPatBufferIDRange, + const std::vector &Diags, + std::vector &Annotations, + unsigned &LabelWidth) { + struct CompareSMLoc { + bool operator()(const SMLoc &LHS, const SMLoc &RHS) const { + return LHS.getPointer() < RHS.getPointer(); + } + }; + // How many diagnostics does each pattern have? + std::map DiagCountPerPattern; + for (const FileCheckDiag &Diag : Diags) + ++DiagCountPerPattern[Diag.CheckLoc]; + // How many diagnostics have we seen so far per pattern? + std::map DiagIndexPerPattern; + // How many total diagnostics have we seen so far? + unsigned DiagIndex = 0; + // What's the widest label? + LabelWidth = 0; + for (auto DiagItr = Diags.begin(), DiagEnd = Diags.end(); DiagItr != DiagEnd; + ++DiagItr) { + InputAnnotation A; + A.DiagIndex = DiagIndex++; + + // Build label, which uniquely identifies this check result. 
+ unsigned CheckBufferID = SM.FindBufferContainingLoc(DiagItr->CheckLoc); + auto CheckLineAndCol = + SM.getLineAndColumn(DiagItr->CheckLoc, CheckBufferID); + llvm::raw_string_ostream Label(A.Label); + Label << GetCheckTypeAbbreviation(DiagItr->CheckTy) << ":"; + if (CheckBufferID == CheckFileBufferID) + Label << CheckLineAndCol.first; + else if (ImpPatBufferIDRange.first <= CheckBufferID && + CheckBufferID < ImpPatBufferIDRange.second) + Label << "imp" << (CheckBufferID - ImpPatBufferIDRange.first + 1); + else + llvm_unreachable("expected diagnostic's check location to be either in " + "the check file or for an implicit pattern"); + if (DiagCountPerPattern[DiagItr->CheckLoc] > 1) + Label << "'" << DiagIndexPerPattern[DiagItr->CheckLoc]++; + LabelWidth = std::max((std::string::size_type)LabelWidth, A.Label.size()); + + A.Marker = GetMarker(DiagItr->MatchTy); + if (!DiagItr->Note.empty()) { + A.Marker.Note = DiagItr->Note; + // It's less confusing if notes that don't actually have ranges don't have + // markers. For example, a marker for 'with "VAR" equal to "5"' would + // seem to indicate where "VAR" matches, but the location we actually have + // for the marker simply points to the start of the match/search range for + // the full pattern of which the substitution is potentially just one + // component. + if (DiagItr->InputStartLine == DiagItr->InputEndLine && + DiagItr->InputStartCol == DiagItr->InputEndCol) + A.Marker.Lead = ' '; + } + if (DiagItr->MatchTy == FileCheckDiag::MatchFoundErrorNote) { + assert(!DiagItr->Note.empty() && + "expected custom note for MatchFoundErrorNote"); + A.Marker.Note = "error: " + A.Marker.Note; + } + A.FoundAndExpectedMatch = + DiagItr->MatchTy == FileCheckDiag::MatchFoundAndExpected; + + // Compute the mark location, and break annotation into multiple + // annotations if it spans multiple lines. + A.IsFirstLine = true; + A.InputLine = DiagItr->InputStartLine; + A.InputStartCol = DiagItr->InputStartCol; + if (DiagItr->InputStartLine == DiagItr->InputEndLine) { + // Sometimes ranges are empty in order to indicate a specific point, but + // that would mean nothing would be marked, so adjust the range to + // include the following character. + A.InputEndCol = + std::max(DiagItr->InputStartCol + 1, DiagItr->InputEndCol); + Annotations.push_back(A); + } else { + assert(DiagItr->InputStartLine < DiagItr->InputEndLine && + "expected input range not to be inverted"); + A.InputEndCol = UINT_MAX; + Annotations.push_back(A); + for (unsigned L = DiagItr->InputStartLine + 1, E = DiagItr->InputEndLine; + L <= E; ++L) { + // If a range ends before the first column on a line, then it has no + // characters on that line, so there's nothing to render. 
+        if (DiagItr->InputEndCol == 1 && L == E)
+          break;
+        InputAnnotation B;
+        B.DiagIndex = A.DiagIndex;
+        B.Label = A.Label;
+        B.IsFirstLine = false;
+        B.InputLine = L;
+        B.Marker = A.Marker;
+        B.Marker.Lead = '~';
+        B.Marker.Note = "";
+        B.InputStartCol = 1;
+        if (L != E)
+          B.InputEndCol = UINT_MAX;
+        else
+          B.InputEndCol = DiagItr->InputEndCol;
+        B.FoundAndExpectedMatch = A.FoundAndExpectedMatch;
+        Annotations.push_back(B);
+      }
+    }
+  }
+}
+
+static unsigned FindInputLineInFilter(
+    DumpInputFilterValue DumpInputFilter, unsigned CurInputLine,
+    const std::vector<InputAnnotation>::iterator &AnnotationBeg,
+    const std::vector<InputAnnotation>::iterator &AnnotationEnd) {
+  if (DumpInputFilter == DumpInputFilterAll)
+    return CurInputLine;
+  for (auto AnnotationItr = AnnotationBeg; AnnotationItr != AnnotationEnd;
+       ++AnnotationItr) {
+    switch (DumpInputFilter) {
+    case DumpInputFilterAll:
+      llvm_unreachable("unexpected DumpInputFilterAll");
+      break;
+    case DumpInputFilterAnnotationFull:
+      return AnnotationItr->InputLine;
+    case DumpInputFilterAnnotation:
+      if (AnnotationItr->IsFirstLine)
+        return AnnotationItr->InputLine;
+      break;
+    case DumpInputFilterError:
+      if (AnnotationItr->IsFirstLine && AnnotationItr->Marker.FiltersAsError)
+        return AnnotationItr->InputLine;
+      break;
+    }
+  }
+  return UINT_MAX;
+}
+
+/// To OS, print a vertical ellipsis (right-justified at LabelWidth) if it
+/// would occupy fewer lines than ElidedLines, but print ElidedLines otherwise.
+/// Either way, clear ElidedLines. Thus, if ElidedLines is empty, do nothing.
+static void DumpEllipsisOrElidedLines(raw_ostream &OS, std::string &ElidedLines,
+                                      unsigned LabelWidth) {
+  if (ElidedLines.empty())
+    return;
+  unsigned EllipsisLines = 3;
+  if (EllipsisLines < StringRef(ElidedLines).count('\n')) {
+    for (unsigned i = 0; i < EllipsisLines; ++i) {
+      WithColor(OS, raw_ostream::BLACK, /*Bold=*/true)
+          << right_justify(".", LabelWidth);
+      OS << '\n';
+    }
+  } else
+    OS << ElidedLines;
+  ElidedLines.clear();
+}
+
+static void DumpAnnotatedInput(raw_ostream &OS, const FileCheckRequest &Req,
+                               DumpInputFilterValue DumpInputFilter,
+                               unsigned DumpInputContext,
+                               StringRef InputFileText,
+                               std::vector<InputAnnotation> &Annotations,
+                               unsigned LabelWidth) {
+  OS << "Input was:\n<<<<<<\n";
+
+  // Sort annotations.
+  llvm::sort(Annotations,
+             [](const InputAnnotation &A, const InputAnnotation &B) {
+               // 1. Sort annotations in the order of the input lines.
+               //
+               // This makes it easier to find relevant annotations while
+               // iterating input lines in the implementation below. FileCheck
+               // does not always produce diagnostics in the order of input
+               // lines due to, for example, CHECK-DAG and CHECK-NOT.
+               if (A.InputLine != B.InputLine)
+                 return A.InputLine < B.InputLine;
+               // 2. Sort annotations in the temporal order FileCheck produced
+               // their associated diagnostics.
+               //
+               // This sort offers several benefits:
+               //
+               // A. On a single input line, the order of annotations reflects
+               //    the FileCheck logic for processing directives/patterns.
+               //    This can be helpful in understanding cases in which the
+               //    order of the associated directives/patterns in the check
+               //    file or on the command line either (i) does not match the
+               //    temporal order in which FileCheck looks for matches for the
+               //    directives/patterns (due to, for example, CHECK-LABEL,
+               //    CHECK-NOT, or `--implicit-check-not`) or (ii) does match
+               //    that order but does not match the order of those
+               //    diagnostics along an input line (due to, for example,
+               //    CHECK-DAG).
+ // + // On the other hand, because our presentation format presents + // input lines in order, there's no clear way to offer the + // same benefit across input lines. For consistency, it might + // then seem worthwhile to have annotations on a single line + // also sorted in input order (that is, by input column). + // However, in practice, this appears to be more confusing + // than helpful. Perhaps it's intuitive to expect annotations + // to be listed in the temporal order in which they were + // produced except in cases the presentation format obviously + // and inherently cannot support it (that is, across input + // lines). + // + // B. When diagnostics' annotations are split among multiple + // input lines, the user must track them from one input line + // to the next. One property of the sort chosen here is that + // it facilitates the user in this regard by ensuring the + // following: when comparing any two input lines, a + // diagnostic's annotations are sorted in the same position + // relative to all other diagnostics' annotations. + return A.DiagIndex < B.DiagIndex; + }); + + // Compute the width of the label column. + const unsigned char *InputFilePtr = InputFileText.bytes_begin(), + *InputFileEnd = InputFileText.bytes_end(); + unsigned LineCount = InputFileText.count('\n'); + if (InputFileEnd[-1] != '\n') + ++LineCount; + unsigned LineNoWidth = std::log10(LineCount) + 1; + // +3 below adds spaces (1) to the left of the (right-aligned) line numbers + // on input lines and (2) to the right of the (left-aligned) labels on + // annotation lines so that input lines and annotation lines are more + // visually distinct. For example, the spaces on the annotation lines ensure + // that input line numbers and check directive line numbers never align + // horizontally. Those line numbers might not even be for the same file. + // One space would be enough to achieve that, but more makes it even easier + // to see. + LabelWidth = std::max(LabelWidth, LineNoWidth) + 3; + + // Print annotated input lines. + unsigned PrevLineInFilter = 0; // 0 means none so far + unsigned NextLineInFilter = 0; // 0 means uncomputed, UINT_MAX means none + std::string ElidedLines; + raw_string_ostream ElidedLinesOS(ElidedLines); + ColorMode TheColorMode = + WithColor(OS).colorsEnabled() ? ColorMode::Enable : ColorMode::Disable; + if (TheColorMode == ColorMode::Enable) + ElidedLinesOS.enable_colors(true); + auto AnnotationItr = Annotations.begin(), AnnotationEnd = Annotations.end(); + for (unsigned Line = 1; + InputFilePtr != InputFileEnd || AnnotationItr != AnnotationEnd; + ++Line) { + const unsigned char *InputFileLine = InputFilePtr; + + // Compute the previous and next line included by the filter. + if (NextLineInFilter < Line) + NextLineInFilter = FindInputLineInFilter(DumpInputFilter, Line, + AnnotationItr, AnnotationEnd); + assert(NextLineInFilter && "expected NextLineInFilter to be computed"); + if (NextLineInFilter == Line) + PrevLineInFilter = Line; + + // Elide this input line and its annotations if it's not within the + // context specified by -dump-input-context of an input line included by + // -dump-input-filter. However, in case the resulting ellipsis would occupy + // more lines than the input lines and annotations it elides, buffer the + // elided lines and annotations so we can print them instead. 
+    raw_ostream *LineOS;
+    if ((!PrevLineInFilter || PrevLineInFilter + DumpInputContext < Line) &&
+        (NextLineInFilter == UINT_MAX ||
+         Line + DumpInputContext < NextLineInFilter))
+      LineOS = &ElidedLinesOS;
+    else {
+      LineOS = &OS;
+      DumpEllipsisOrElidedLines(OS, ElidedLines, LabelWidth);
+    }
+
+    // Print right-aligned line number.
+    WithColor(*LineOS, raw_ostream::BLACK, /*Bold=*/true, /*BG=*/false,
+              TheColorMode)
+        << format_decimal(Line, LabelWidth) << ": ";
+
+    // For the case where -v and colors are enabled, find the annotations for
+    // good matches for expected patterns in order to highlight everything
+    // else in the line. There are no such annotations if -v is disabled.
+    std::vector<InputAnnotation> FoundAndExpectedMatches;
+    if (Req.Verbose && TheColorMode == ColorMode::Enable) {
+      for (auto I = AnnotationItr; I != AnnotationEnd && I->InputLine == Line;
+           ++I) {
+        if (I->FoundAndExpectedMatch)
+          FoundAndExpectedMatches.push_back(*I);
+      }
+    }
+
+    // Print numbered line with highlighting where there are no matches for
+    // expected patterns.
+    bool Newline = false;
+    {
+      WithColor COS(*LineOS, raw_ostream::SAVEDCOLOR, /*Bold=*/false,
+                    /*BG=*/false, TheColorMode);
+      bool InMatch = false;
+      if (Req.Verbose)
+        COS.changeColor(raw_ostream::CYAN, true, true);
+      for (unsigned Col = 1; InputFilePtr != InputFileEnd && !Newline; ++Col) {
+        bool WasInMatch = InMatch;
+        InMatch = false;
+        for (const InputAnnotation &M : FoundAndExpectedMatches) {
+          if (M.InputStartCol <= Col && Col < M.InputEndCol) {
+            InMatch = true;
+            break;
+          }
+        }
+        if (!WasInMatch && InMatch)
+          COS.resetColor();
+        else if (WasInMatch && !InMatch)
+          COS.changeColor(raw_ostream::CYAN, true, true);
+        if (*InputFilePtr == '\n') {
+          Newline = true;
+          COS << ' ';
+        } else
+          COS << *InputFilePtr;
+        ++InputFilePtr;
+      }
+    }
+    *LineOS << '\n';
+    unsigned InputLineWidth = InputFilePtr - InputFileLine;
+
+    // Print any annotations.
+    while (AnnotationItr != AnnotationEnd &&
+           AnnotationItr->InputLine == Line) {
+      WithColor COS(*LineOS, AnnotationItr->Marker.Color, /*Bold=*/true,
+                    /*BG=*/false, TheColorMode);
+      // The two spaces below are where the ": " appears on input lines.
+      COS << left_justify(AnnotationItr->Label, LabelWidth) << "  ";
+      unsigned Col;
+      for (Col = 1; Col < AnnotationItr->InputStartCol; ++Col)
+        COS << ' ';
+      COS << AnnotationItr->Marker.Lead;
+      // If InputEndCol=UINT_MAX, stop at InputLineWidth.
+      for (++Col; Col < AnnotationItr->InputEndCol && Col <= InputLineWidth;
+           ++Col)
+        COS << '~';
+      const std::string &Note = AnnotationItr->Marker.Note;
+      if (!Note.empty()) {
+        // Put the note at the end of the input line. If we were to instead
+        // put the note right after the marker, subsequent annotations for the
+        // same input line might appear to mark this note instead of the input
+        // line.
+        for (; Col <= InputLineWidth; ++Col)
+          COS << ' ';
+        COS << ' ' << Note;
+      }
+      COS << '\n';
+      ++AnnotationItr;
+    }
+  }
+  DumpEllipsisOrElidedLines(OS, ElidedLines, LabelWidth);
+
+  OS << ">>>>>>\n";
+}
+
+int main(int argc, char **argv) {
+  // Enable use of ANSI color codes because FileCheck is using them to
+  // highlight text.
+  llvm::sys::Process::UseANSIEscapeCodes(true);
+
+  InitLLVM X(argc, argv);
+  cl::ParseCommandLineOptions(argc, argv, /*Overview*/ "", /*Errs*/ nullptr,
+                              "FILECHECK_OPTS");
+
+  // Select -dump-input* values. The -help documentation specifies the default
+  // value and which value to choose if an option is specified multiple times.
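+  // (For example, `-dump-input=fail -dump-input=always` ends up dumping the
+  // input unconditionally: the enumerators are ordered never < fail < always
+  // < help, and the maximum across all occurrences wins.)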
+  // In the latter case, the general rule of thumb is to choose the value that
+  // provides the most information.
+  DumpInputValue DumpInput =
+      DumpInputs.empty() ? DumpInputFail : *llvm::max_element(DumpInputs);
+  DumpInputFilterValue DumpInputFilter;
+  if (DumpInputFilters.empty())
+    DumpInputFilter = DumpInput == DumpInputAlways ? DumpInputFilterAll
+                                                   : DumpInputFilterError;
+  else
+    DumpInputFilter = *llvm::max_element(DumpInputFilters);
+  unsigned DumpInputContext =
+      DumpInputContexts.empty() ? 5 : *llvm::max_element(DumpInputContexts);
+
+  if (DumpInput == DumpInputHelp) {
+    DumpInputAnnotationHelp(outs());
+    return 0;
+  }
+  if (CheckFilename.empty()) {
+    errs() << "<check-file> not specified\n";
+    return 2;
+  }
+
+  FileCheckRequest Req;
+  append_range(Req.CheckPrefixes, CheckPrefixes);
+
+  append_range(Req.CommentPrefixes, CommentPrefixes);
+
+  append_range(Req.ImplicitCheckNot, ImplicitCheckNot);
+
+  bool GlobalDefineError = false;
+  for (StringRef G : GlobalDefines) {
+    size_t EqIdx = G.find('=');
+    if (EqIdx == std::string::npos) {
+      errs() << "Missing equal sign in command-line definition '-D" << G
+             << "'\n";
+      GlobalDefineError = true;
+      continue;
+    }
+    if (EqIdx == 0) {
+      errs() << "Missing variable name in command-line definition '-D" << G
+             << "'\n";
+      GlobalDefineError = true;
+      continue;
+    }
+    Req.GlobalDefines.push_back(G);
+  }
+  if (GlobalDefineError)
+    return 2;
+
+  Req.AllowEmptyInput = AllowEmptyInput;
+  Req.AllowUnusedPrefixes = AllowUnusedPrefixes;
+  Req.EnableVarScope = EnableVarScope;
+  Req.AllowDeprecatedDagOverlap = AllowDeprecatedDagOverlap;
+  Req.Verbose = Verbose;
+  Req.VerboseVerbose = VerboseVerbose;
+  Req.NoCanonicalizeWhiteSpace = NoCanonicalizeWhiteSpace;
+  Req.MatchFullLines = MatchFullLines;
+  Req.IgnoreCase = IgnoreCase;
+
+  if (VerboseVerbose)
+    Req.Verbose = true;
+
+  FileCheck FC(Req);
+  if (!FC.ValidateCheckPrefixes())
+    return 2;
+
+  SourceMgr SM;
+
+  // Read the expected strings from the check file.
+  ErrorOr<std::unique_ptr<MemoryBuffer>> CheckFileOrErr =
+      MemoryBuffer::getFileOrSTDIN(CheckFilename, /*IsText=*/true);
+  if (std::error_code EC = CheckFileOrErr.getError()) {
+    errs() << "Could not open check file '" << CheckFilename
+           << "': " << EC.message() << '\n';
+    return 2;
+  }
+  MemoryBuffer &CheckFile = *CheckFileOrErr.get();
+
+  SmallString<4096> CheckFileBuffer;
+  StringRef CheckFileText = FC.CanonicalizeFile(CheckFile, CheckFileBuffer);
+
+  unsigned CheckFileBufferID =
+      SM.AddNewSourceBuffer(MemoryBuffer::getMemBuffer(
+                                CheckFileText, CheckFile.getBufferIdentifier()),
+                            SMLoc());
+
+  std::pair<unsigned, unsigned> ImpPatBufferIDRange;
+  if (FC.readCheckFile(SM, CheckFileText, &ImpPatBufferIDRange))
+    return 2;
+
+  // Open the file to check and add it to SourceMgr.
+  ErrorOr<std::unique_ptr<MemoryBuffer>> InputFileOrErr =
+      MemoryBuffer::getFileOrSTDIN(InputFilename, /*IsText=*/true);
+  if (InputFilename == "-")
+    InputFilename = "<stdin>"; // Overwrite for improved diagnostic messages
+  if (std::error_code EC = InputFileOrErr.getError()) {
+    errs() << "Could not open input file '" << InputFilename
+           << "': " << EC.message() << '\n';
+    return 2;
+  }
+  MemoryBuffer &InputFile = *InputFileOrErr.get();
+
+  if (InputFile.getBufferSize() == 0 && !AllowEmptyInput) {
+    errs() << "FileCheck error: '" << InputFilename << "' is empty.\n";
+    DumpCommandLine(argc, argv);
+    return 2;
+  }
+
+  SmallString<4096> InputFileBuffer;
+  StringRef InputFileText = FC.CanonicalizeFile(InputFile, InputFileBuffer);
+
+  SM.AddNewSourceBuffer(MemoryBuffer::getMemBuffer(
+                            InputFileText, InputFile.getBufferIdentifier()),
+                        SMLoc());
+
+  std::vector<FileCheckDiag> Diags;
+  int ExitCode = FC.checkInput(SM, InputFileText,
+                               DumpInput == DumpInputNever ? nullptr : &Diags)
+                     ? EXIT_SUCCESS
+                     : 1;
+  if (DumpInput == DumpInputAlways ||
+      (ExitCode == 1 && DumpInput == DumpInputFail)) {
+    errs() << "\n"
+           << "Input file: " << InputFilename << "\n"
+           << "Check file: " << CheckFilename << "\n"
+           << "\n"
+           << "-dump-input=help explains the following input dump.\n"
+           << "\n";
+    std::vector<InputAnnotation> Annotations;
+    unsigned LabelWidth;
+    BuildInputAnnotations(SM, CheckFileBufferID, ImpPatBufferIDRange, Diags,
+                          Annotations, LabelWidth);
+    DumpAnnotatedInput(errs(), Req, DumpInputFilter, DumpInputContext,
+                       InputFileText, Annotations, LabelWidth);
+  }
+
+  return ExitCode;
+}

From 4de521a38f460fdf63c7438832bbd06d2be36180 Mon Sep 17 00:00:00 2001
From: Martin Kinkelin
Date: Sun, 9 Nov 2025 19:58:47 +0100
Subject: [PATCH 04/10] GC2Stack: Fix logic regression for LLVM 21+

---
 gen/passes/GarbageCollect2Stack.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/gen/passes/GarbageCollect2Stack.cpp b/gen/passes/GarbageCollect2Stack.cpp
index 631e9f100a..34325939d3 100644
--- a/gen/passes/GarbageCollect2Stack.cpp
+++ b/gen/passes/GarbageCollect2Stack.cpp
@@ -746,18 +746,16 @@ bool isSafeToStackAllocate(BasicBlock::iterator Alloc, Value *V,
   for (auto A = B; A != E; ++A) {
     if (A->get() == V) {
 #if LDC_LLVM_VER >= 2100
-      if (CB->paramHasAttr(A - B, llvm::Attribute::AttrKind::Captures)) {
-        return capturesNothing(
-            CB->getParamAttr(A - B, llvm::Attribute::AttrKind::Captures)
-                .getCaptureInfo());
-      }
-
+      if (!(CB->paramHasAttr(A - B, llvm::Attribute::AttrKind::Captures) &&
+            capturesNothing(
+                CB->getParamAttr(A - B, llvm::Attribute::AttrKind::Captures)
+                    .getCaptureInfo()))) {
 #else
       if (!CB->paramHasAttr(A - B, llvm::Attribute::AttrKind::NoCapture)) {
+#endif
         // The parameter is not marked 'nocapture' - captured.
         return false;
       }
-#endif
 
       if (auto call = dyn_cast<CallInst>(static_cast<Instruction *>(CB))) {
         if (call->isTailCall()) {

From 64edb735a39bda8158697404b81a438bad1ccfff Mon Sep 17 00:00:00 2001
From: Martin Kinkelin
Date: Sun, 9 Nov 2025 20:04:08 +0100
Subject: [PATCH 05/10] [lit-test: Adapt to cosmetic asm offset changes with LLVM 21]

---
 .../dmd_style_asm_with_variable_and_offset_memory_reference.d | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/codegen/dmd_style_asm_with_variable_and_offset_memory_reference.d b/tests/codegen/dmd_style_asm_with_variable_and_offset_memory_reference.d
index d6a6701186..e2468bd94a 100644
--- a/tests/codegen/dmd_style_asm_with_variable_and_offset_memory_reference.d
+++ b/tests/codegen/dmd_style_asm_with_variable_and_offset_memory_reference.d
@@ -26,7 +26,7 @@ uint getHighHalfOfGlobal(ulong value)
 {
     asm
     {
-        // CHECK: movl ((4+(-8))+_someGlobalVariable)+8, %eax
+        // CHECK: movl {{\(?\(?4\+\(?-8\)?\)?\+_someGlobalVariable\)?\+8}}, %eax
         mov EAX, dword ptr [someGlobalVariable + 4];
     }
 }

From fe80b5c57bec58032c9fcebc27f4e6f694582bfa Mon Sep 17 00:00:00 2001
From: Martin Kinkelin
Date: Sun, 9 Nov 2025 20:41:09 +0100
Subject: [PATCH 06/10] [adapt android-llvm-config.in to LLVM 21]

---
 .../3-build-cross/android-llvm-config.in      | 29 ++-----------------
 1 file changed, 2 insertions(+), 27 deletions(-)

diff --git a/.github/actions/3-build-cross/android-llvm-config.in b/.github/actions/3-build-cross/android-llvm-config.in
index 8ac719b4b9..6555349156 100644
--- a/.github/actions/3-build-cross/android-llvm-config.in
+++ b/.github/actions/3-build-cross/android-llvm-config.in
@@ -50,33 +50,8 @@ if [ "$has_rtti" != "YES" ]; then CXXFLAGS="$CXXFLAGS -fno-rtti"; fi
 LDFLAGS="-L${prefix}/lib"
 LIBFILE="${prefix}/lib/libLLVM-$version.so"
 
-components="aarch64 aarch64asmparser aarch64codegen aarch64desc aarch64disassembler aarch64info aarch64utils \
-aggressiveinstcombine all all-targets analysis arm armasmparser armcodegen armdesc armdisassembler arminfo armutils \
-asmparser asmprinter binaryformat bitreader bitstreamreader bitwriter cfguard cgdata codegen codegentypes core coroutines coverage \
-debuginfobtf debuginfocodeview debuginfodwarf debuginfogsym debuginfologicalview debuginfomsf debuginfopdb demangle dlltooldriver dwarflinker dwarflinkerclassic dwarflinkerparallel dwp \
-engine executionengine extensions filecheck frontendatomic frontenddriver frontendhlsl frontendoffloading frontendopenacc frontendopenmp fuzzercli fuzzmutate globalisel hipstdpar instcombine \
-instrumentation interfacestub interpreter ipo irprinter irreader jitlink libdriver lineeditor linker lto mc mca mcdisassembler \
-mcjit mcparser mirparser native nativecodegen objcarcopts objcopy object objectyaml option orcdebugging orcjit orcshared orctargetprocess \
-passes profiledata remarks runtimedyld sandboxir scalaropts selectiondag spirv spirvanalysis spirvcodegen spirvdesc spirvinfo support symbolize tablegen target targetparser telemetry textapi \
-textapibinaryreader transformutils vectorize webassembly webassemblyasmparser webassemblycodegen webassemblydesc webassemblydisassembler \
-webassemblyinfo webassemblyutils windowsdriver windowsmanifest x86 x86asmparser x86codegen x86desc x86disassembler x86info \
-x86targetmca xray"
-static_libs="-lLLVMWindowsManifest -lLLVMXRay -lLLVMLibDriver -lLLVMDlltoolDriver -lLLVMTelemetry -lLLVMTextAPIBinaryReader -lLLVMCoverage -lLLVMLineEditor \
--lLLVMX86TargetMCA -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMX86Desc -lLLVMX86Info \
--lLLVMSPIRVCodeGen -lLLVMSPIRVDesc -lLLVMSPIRVInfo -lLLVMSPIRVAnalysis -lLLVMWebAssemblyDisassembler \ --lLLVMWebAssemblyAsmParser -lLLVMWebAssemblyCodeGen -lLLVMWebAssemblyUtils -lLLVMWebAssemblyDesc -lLLVMWebAssemblyInfo -lLLVMARMDisassembler \ --lLLVMARMAsmParser -lLLVMARMCodeGen -lLLVMARMDesc -lLLVMARMUtils -lLLVMARMInfo -lLLVMAArch64Disassembler \ --lLLVMAArch64AsmParser -lLLVMAArch64CodeGen -lLLVMAArch64Desc -lLLVMAArch64Utils -lLLVMAArch64Info -lLLVMOrcDebugging -lLLVMOrcJIT \ --lLLVMWindowsDriver -lLLVMMCJIT -lLLVMJITLink -lLLVMInterpreter -lLLVMExecutionEngine -lLLVMRuntimeDyld -lLLVMOrcTargetProcess -lLLVMOrcShared \ --lLLVMDWP -lLLVMDebugInfoLogicalView -lLLVMDebugInfoGSYM -lLLVMOption -lLLVMObjectYAML -lLLVMObjCopy -lLLVMMCA \ --lLLVMMCDisassembler -lLLVMLTO -lLLVMPasses -lLLVMHipStdPar -lLLVMCFGuard -lLLVMCoroutines -lLLVMipo \ --lLLVMVectorize -lLLVMSandboxIR -lLLVMLinker -lLLVMInstrumentation -lLLVMFrontendOpenMP -lLLVMFrontendOffloading -lLLVMFrontendOpenACC -lLLVMFrontendHLSL -lLLVMFrontendDriver -lLLVMFrontendAtomic -lLLVMExtensions \ --lLLVMDWARFLinkerParallel -lLLVMDWARFLinkerClassic -lLLVMDWARFLinker -lLLVMGlobalISel -lLLVMMIRParser -lLLVMAsmPrinter -lLLVMSelectionDAG \ --lLLVMCodeGen -lLLVMTarget -lLLVMObjCARCOpts -lLLVMCodeGenTypes -lLLVMCGData -lLLVMIRPrinter -lLLVMInterfaceStub -lLLVMFileCheck -lLLVMFuzzMutate \ --lLLVMScalarOpts -lLLVMInstCombine -lLLVMAggressiveInstCombine -lLLVMTransformUtils -lLLVMBitWriter -lLLVMAnalysis \ --lLLVMProfileData -lLLVMSymbolize -lLLVMDebugInfoBTF -lLLVMDebugInfoPDB -lLLVMDebugInfoMSF -lLLVMDebugInfoCodeView -lLLVMDebugInfoDWARF -lLLVMObject -lLLVMTextAPI -lLLVMMCParser -lLLVMIRReader -lLLVMAsmParser -lLLVMMC \ --lLLVMBitReader -lLLVMFuzzerCLI -lLLVMCore -lLLVMRemarks -lLLVMBitstreamReader -lLLVMBinaryFormat -lLLVMTargetParser -lLLVMTableGen -lLLVMSupport \ --lLLVMDemangle" +components="aarch64 aarch64asmparser aarch64codegen aarch64desc aarch64disassembler aarch64info aarch64utils aggressiveinstcombine all all-targets analysis arm armasmparser armcodegen armdesc armdisassembler arminfo armutils asmparser asmprinter binaryformat bitreader bitstreamreader bitwriter cfguard cgdata codegen codegentypes core coroutines coverage debuginfobtf debuginfocodeview debuginfodwarf debuginfodwarflowlevel debuginfogsym debuginfologicalview debuginfomsf debuginfopdb demangle dlltooldriver dwarfcfichecker dwarflinker dwarflinkerclassic dwarflinkerparallel dwp engine executionengine extensions filecheck frontendatomic frontenddirective frontenddriver frontendhlsl frontendoffloading frontendopenacc frontendopenmp fuzzercli fuzzmutate globalisel hipstdpar instcombine instrumentation interfacestub interpreter ipo irprinter irreader jitlink libdriver lineeditor linker lto mc mca mcdisassembler mcjit mcparser mirparser native nativecodegen objcarcopts objcopy object objectyaml option orcdebugging orcjit orcshared orctargetprocess passes profiledata remarks runtimedyld sandboxir scalaropts selectiondag spirv spirvanalysis spirvcodegen spirvdesc spirvinfo support symbolize tablegen target targetparser telemetry textapi textapibinaryreader transformutils vectorize webassembly webassemblyasmparser webassemblycodegen webassemblydesc webassemblydisassembler webassemblyinfo webassemblyutils windowsdriver windowsmanifest x86 x86asmparser x86codegen x86desc x86disassembler x86info x86targetmca xray" +static_libs="-lLLVMWindowsManifest -lLLVMXRay -lLLVMLibDriver -lLLVMDlltoolDriver -lLLVMTelemetry -lLLVMTextAPIBinaryReader -lLLVMCoverage -lLLVMLineEditor 
-lLLVMX86TargetMCA -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMX86Desc -lLLVMX86Info -lLLVMWebAssemblyDisassembler -lLLVMWebAssemblyAsmParser -lLLVMWebAssemblyCodeGen -lLLVMWebAssemblyUtils -lLLVMWebAssemblyDesc -lLLVMWebAssemblyInfo -lLLVMSPIRVCodeGen -lLLVMSPIRVDesc -lLLVMSPIRVInfo -lLLVMSPIRVAnalysis -lLLVMARMDisassembler -lLLVMARMAsmParser -lLLVMARMCodeGen -lLLVMARMDesc -lLLVMARMUtils -lLLVMARMInfo -lLLVMAArch64Disassembler -lLLVMAArch64AsmParser -lLLVMAArch64CodeGen -lLLVMAArch64Desc -lLLVMAArch64Utils -lLLVMAArch64Info -lLLVMOrcDebugging -lLLVMOrcJIT -lLLVMWindowsDriver -lLLVMMCJIT -lLLVMJITLink -lLLVMInterpreter -lLLVMExecutionEngine -lLLVMRuntimeDyld -lLLVMOrcTargetProcess -lLLVMOrcShared -lLLVMDWP -lLLVMDWARFCFIChecker -lLLVMDebugInfoLogicalView -lLLVMOption -lLLVMObjCopy -lLLVMMCA -lLLVMMCDisassembler -lLLVMLTO -lLLVMPasses -lLLVMHipStdPar -lLLVMCFGuard -lLLVMCoroutines -lLLVMipo -lLLVMVectorize -lLLVMSandboxIR -lLLVMLinker -lLLVMFrontendOpenMP -lLLVMFrontendOffloading -lLLVMObjectYAML -lLLVMFrontendOpenACC -lLLVMFrontendHLSL -lLLVMFrontendDriver -lLLVMInstrumentation -lLLVMFrontendDirective -lLLVMFrontendAtomic -lLLVMExtensions -lLLVMDWARFLinkerParallel -lLLVMDWARFLinkerClassic -lLLVMDWARFLinker -lLLVMGlobalISel -lLLVMMIRParser -lLLVMAsmPrinter -lLLVMSelectionDAG -lLLVMCodeGen -lLLVMTarget -lLLVMObjCARCOpts -lLLVMCodeGenTypes -lLLVMCGData -lLLVMIRPrinter -lLLVMInterfaceStub -lLLVMFileCheck -lLLVMFuzzMutate -lLLVMScalarOpts -lLLVMInstCombine -lLLVMAggressiveInstCombine -lLLVMTransformUtils -lLLVMBitWriter -lLLVMAnalysis -lLLVMProfileData -lLLVMSymbolize -lLLVMDebugInfoBTF -lLLVMDebugInfoPDB -lLLVMDebugInfoMSF -lLLVMDebugInfoCodeView -lLLVMDebugInfoGSYM -lLLVMDebugInfoDWARF -lLLVMDebugInfoDWARFLowLevel -lLLVMObject -lLLVMTextAPI -lLLVMMCParser -lLLVMIRReader -lLLVMAsmParser -lLLVMMC -lLLVMBitReader -lLLVMFuzzerCLI -lLLVMCore -lLLVMRemarks -lLLVMBitstreamReader -lLLVMBinaryFormat -lLLVMTargetParser -lLLVMTableGen -lLLVMSupport -lLLVMDemangle" shared_libs="-lLLVM-$version" libs=$static_libs handle_args () { From 158a8f0bbf4d0e7ee7ca75c0d21d43263b553cfd Mon Sep 17 00:00:00 2001 From: Martin Kinkelin Date: Sun, 9 Nov 2025 21:01:55 +0100 Subject: [PATCH 07/10] lit-tests: Adapt codegen/align.d to LLVM 21 IR changes --- tests/codegen/align.d | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/codegen/align.d b/tests/codegen/align.d index 9e338a3076..eade1e1045 100644 --- a/tests/codegen/align.d +++ b/tests/codegen/align.d @@ -21,12 +21,12 @@ Outer passAndReturnOuterByVal(Outer arg) { return arg; } // CHECK-SAME: ptr {{noalias sret.*|inreg noalias}} align 32 %.sret_arg /* How the arg is passed by value is ABI-specific, but the pointer must be aligned. * When the argument is passed as a byte array and copied into a stack alloc, that stack alloca must be aligned. 
 */
-// CHECK: {{(align 32 %arg|%arg = alloca %align.Outer, align 32)}}
+// CHECK: {{(align 32 %arg|%arg = alloca %align.Outer, align 32|call void @llvm.memcpy.* %.sret_arg,.* %arg)}}
 Outer passAndReturnOuterByVal(Outer arg) { return arg; }
 
 // CHECK: define{{.*}} void @{{.*}}_D5align23passAndReturnInnerByValFSQBh5InnerZQl
 // CHECK-SAME: ptr {{noalias sret.*|inreg noalias}} align 32 %.sret_arg
-// CHECK: {{(align 32 %arg|%arg = alloca %align.Inner, align 32)}}
+// CHECK: {{(align 32 %arg|%arg = alloca %align.Inner, align 32|call void @llvm.memcpy.* %.sret_arg,.* %arg)}}
 Inner passAndReturnInnerByVal(Inner arg) { return arg; }
 
 void main()
 {
     Outer outer;
@@ -61,11 +61,11 @@ void main()
     // CHECK: call{{.*}} void @{{.*}}_D5align23passAndReturnOuterByValFSQBh5OuterZQl
     // CHECK-SAME: ptr {{noalias sret.*|inreg noalias}} align 32 %.sret_tmp
     // The argument is either passed by aligned (optimizer hint) pointer or as an array of i32/64 and copied into an aligned stack slot inside the callee.
-    // CHECK-SAME: {{(align 32 %|\[[0-9]+ x i..\])}}
+    // CHECK-SAME: {{(align 32 |\[[0-9]+ x i..\])}}
 
     inner = passAndReturnInnerByVal(inner);
     // CHECK: call{{.*}} void @{{.*}}_D5align23passAndReturnInnerByValFSQBh5InnerZQl
     // CHECK-SAME: ptr {{noalias sret.*|inreg noalias}} align 32 %.sret_tmp
     // The argument is either passed by aligned (optimizer hint) pointer or as an array of i32/64 and copied into an aligned stack slot inside the callee.
-    // CHECK-SAME: {{(align 32 %|\[[0-9]+ x i..\])}}
+    // CHECK-SAME: {{(align 32 |\[[0-9]+ x i..\])}}
 }

From 402ce58154891956256847bb4a4bc8cc4ff2397f Mon Sep 17 00:00:00 2001
From: Martin Kinkelin
Date: Mon, 10 Nov 2025 22:34:15 +0100
Subject: [PATCH 08/10] Main CI: Switch macOS x86_64 job to Homebrew clang 21 too

Don't just use Homebrew clang 21 for macOS arm64, but for x86_64 too,
now that LDC-LLVM consistently uses it as well.

Also bump the macOS x86_64 image from macos-13 to macos-15-intel (while
using macos-15 for macOS arm64).
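
For reference, a local x86_64 build would mirror the new CI configuration
roughly like this (a sketch; paths assume the default Homebrew prefix; see
the workflow diff below for the authoritative flags):

    cmake -DCMAKE_OSX_SYSROOT=/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk \
          -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang \
          -DCMAKE_CXX_COMPILER=/usr/local/opt/llvm/bin/clang++ ..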
The Homebrew clang 21 apparently needs 2 tweaks: * use in combination with the Command Line Tools, not Xcode * remove the bundled libc++ headers, using the macOS ones instead (matching the *linked* libc++) --- .github/actions/1-setup/action.yml | 17 +++++++++++++---- .github/actions/4d-test-libs/action.yml | 4 ++++ .github/workflows/main.yml | 19 ++++++++++++------- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/.github/actions/1-setup/action.yml b/.github/actions/1-setup/action.yml index 8f0ee4a8b1..de7275bbf0 100644 --- a/.github/actions/1-setup/action.yml +++ b/.github/actions/1-setup/action.yml @@ -49,10 +49,19 @@ runs: sudo ln -sf $tool-21 /usr/bin/$tool $tool --version done - - name: 'macOS arm64: Install Homebrew clang 21' # see mimalloc comment in ../3-build-native/action.yml - if: runner.os == 'macOS' && inputs.arch == 'arm64' + - name: 'macOS: Install Homebrew clang and lld 21' # see mimalloc comment in ../3-build-native/action.yml + if: runner.os == 'macOS' shell: bash - run: brew install llvm@21 + run: | + set -eux + brew install llvm@21 lld@21 + # https://github.com/llvm/llvm-project/issues/155531#issuecomment-3229499205 + if [[ '${{ inputs.arch }}' == arm64 ]]; then + prefix="/opt/homebrew/opt/llvm" + else + prefix="/usr/local/opt/llvm" + fi + rm -rf "$prefix/include/c++/v1" - name: 'Windows: Install clang v21.1.5 from GitHub' if: runner.os == 'Windows' shell: bash @@ -133,7 +142,7 @@ runs: run: | set -euxo pipefail python3 --version - if [[ '${{ runner.os }}-${{ inputs.arch }}' == 'macOS-arm64' ]]; then + if [[ '${{ runner.os }}' == 'macOS' ]]; then brew install lit else python3 -m pip install --user lit psutil diff --git a/.github/actions/4d-test-libs/action.yml b/.github/actions/4d-test-libs/action.yml index 01844f595b..7f4dbd3d0e 100644 --- a/.github/actions/4d-test-libs/action.yml +++ b/.github/actions/4d-test-libs/action.yml @@ -33,6 +33,10 @@ runs: if [[ '${{ runner.os }}' == macOS ]]; then # FIXME: https://github.com/dlang/phobos/issues/10730 excludes+='|^std.experimental.allocator.building_blocks.allocator_list' + if [[ '${{ inputs.arch }}' == x86_64 ]]; then + # FIXME: regressed with image bump from macos-13 to macos-15-intel, apparently wrt. getpwnam_r() setting unexpected errno + excludes+='|^std.path' + fi fi ctest -j$N --output-on-failure -E "$excludes" --timeout 120 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4f1f59043b..d89b1214e3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -14,7 +14,7 @@ concurrency: cancel-in-progress: true env: - LLVM_VERSION: c922a5f9 + LLVM_VERSION: 0b762af9 jobs: build-native: @@ -71,12 +71,16 @@ jobs: with_pgo: true - job_name: macOS x86_64 - os: macos-13 + os: macos-15-intel arch: x86_64 - # https://github.com/ldc-developers/ldc/issues/4462: - # When using LTO, we need to explicitly export ~all symbols for plugin support via `ld64 -exported_symbol '__*'`. - # Additionally `-w` to suppress resulting linker warnings. + # * CMAKE_OSX_SYSROOT: Homebrew clang apparently requires the Command Line Tools instead of Xcode: https://github.com/actions/runner-images/issues/10035#issue-2344536514 + # * https://github.com/ldc-developers/ldc/issues/4462: + # When using LTO, we need to explicitly export ~all symbols for plugin support via `ld64 -exported_symbol '__*'`. + # Additionally `-w` to suppress resulting linker warnings. 
         extra_cmake_flags: >-
+          -DCMAKE_OSX_SYSROOT=/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk
+          -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang
+          -DCMAKE_CXX_COMPILER=/usr/local/opt/llvm/bin/clang++
           -DD_COMPILER_FLAGS="-O -flto=full -defaultlib=phobos2-ldc-lto,druntime-ldc-lto -L-exported_symbol '-L__*' -L-w"
           -DEXTRA_CXXFLAGS=-flto=full
         with_pgo: true

@@ -85,8 +89,9 @@ jobs:
         os: macos-15
         arch: arm64
         extra_cmake_flags: >-
-          -DCMAKE_C_COMPILER=/opt/homebrew/opt/llvm@21/bin/clang
-          -DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm@21/bin/clang++
+          -DCMAKE_OSX_SYSROOT=/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk
+          -DCMAKE_C_COMPILER=/opt/homebrew/opt/llvm/bin/clang
+          -DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm/bin/clang++
           -DD_COMPILER_FLAGS="-O -flto=full -defaultlib=phobos2-ldc-lto,druntime-ldc-lto -L-exported_symbol '-L__*' -L-w"
           -DEXTRA_CXXFLAGS=-flto=full
         with_pgo: true

From 50d5dd075b4e4cc0b38f4ef06f8de0d7a9f977a0 Mon Sep 17 00:00:00 2001
From: Martin Kinkelin
Date: Tue, 11 Nov 2025 03:33:00 +0100
Subject: [PATCH 09/10] CMake: Prefer LLVM 21

---
 cmake/Modules/FindLLVM.cmake | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/cmake/Modules/FindLLVM.cmake b/cmake/Modules/FindLLVM.cmake
index 40a8fb8ba2..75fa1eee29 100644
--- a/cmake/Modules/FindLLVM.cmake
+++ b/cmake/Modules/FindLLVM.cmake
@@ -32,7 +32,8 @@
 # We also want an user-specified LLVM_ROOT_DIR to take precedence over the
 # system default locations such as /usr/local/bin. Executing find_program()
 # multiples times is the approach recommended in the docs.
-set(llvm_config_names llvm-config-20.1 llvm-config201 llvm-config-20
+set(llvm_config_names llvm-config-21.1 llvm-config211 llvm-config-21
+                      llvm-config-20.1 llvm-config201 llvm-config-20
                       llvm-config-19.1 llvm-config191 llvm-config-19
                       llvm-config-18.1 llvm-config181 llvm-config-18
                       llvm-config-17.0 llvm-config170 llvm-config-17
@@ -48,10 +49,12 @@ if(APPLE)
     # extra fallbacks for MacPorts & Homebrew
     find_program(LLVM_CONFIG
         NAMES ${llvm_config_names}
-        PATHS /opt/local/libexec/llvm-20/bin /opt/local/libexec/llvm-19/bin
+        PATHS /opt/local/libexec/llvm-21/bin
+              /opt/local/libexec/llvm-20/bin /opt/local/libexec/llvm-19/bin
               /opt/local/libexec/llvm-18/bin /opt/local/libexec/llvm-17/bin
               /opt/local/libexec/llvm-16/bin /opt/local/libexec/llvm-15/bin
               /opt/local/libexec/llvm/bin
+              /usr/local/opt/llvm@21/bin
               /usr/local/opt/llvm@20/bin /usr/local/opt/llvm@19/bin
               /usr/local/opt/llvm@18/bin /usr/local/opt/llvm@17/bin
               /usr/local/opt/llvm@16/bin /usr/local/opt/llvm@15/bin

From ed2051487ecf663e1b556b491ae0264b9471cb8b Mon Sep 17 00:00:00 2001
From: Martin Kinkelin
Date: Tue, 11 Nov 2025 03:29:54 +0100
Subject: [PATCH 10/10] CI: Add vanilla-LLVM 20 job

---
 .github/workflows/supported_llvm_versions.yml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/supported_llvm_versions.yml b/.github/workflows/supported_llvm_versions.yml
index 4900f14e87..c3fefb813b 100644
--- a/.github/workflows/supported_llvm_versions.yml
+++ b/.github/workflows/supported_llvm_versions.yml
@@ -19,6 +19,11 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          - job_name: Ubuntu 24.04, LLVM 20, latest LDC beta
+            os: ubuntu-24.04
+            host_dc: ldc-beta
+            llvm_version: 20
+            cmake_flags: -DCMAKE_EXE_LINKER_FLAGS=-lcurl # work around libcurl dependency for ldc-profdata (LLVM apparently built with non-default LLVM_ENABLE_CURL=ON)
           - job_name: Ubuntu 24.04, LLVM 19, latest LDC beta
             os: ubuntu-24.04
             host_dc: ldc-beta
@@ -74,7 +79,7 @@ jobs:
       run: |
         set -eux
         sudo apt-get update
-        sudo apt-get install gdb lld-${{ matrix.llvm_version }} llvm-${{ matrix.llvm_version }}-dev libclang-common-${{ matrix.llvm_version }}-dev
+        sudo apt-get install gdb lld-${{ matrix.llvm_version }} llvm-${{ matrix.llvm_version }}-dev libclang-common-${{ matrix.llvm_version }}-dev ${{ matrix.llvm_version == '20' && 'libcurl4-openssl-dev' || '' }}

     - name: 'macOS: Install a more recent GNU make'
       if: runner.os == 'macOS'