Skip to content

Commit 8e4da30

Browse files
pthamminigcbot
authored and committed
Replace Atomic Fence with GenISA_source_value try 3
1 parent bcc4c16 commit 8e4da30

File tree

4 files changed

+196
-6
lines changed

4 files changed

+196
-6
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8581,6 +8581,7 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst *inst) {
85818581
}
85828582
case GenISAIntrinsic::GenISA_source_value: {
85838583
m_encoder->Copy(m_currShader->GetNULL(), GetSymbol(inst->getOperand(0)));
8584+
m_encoder->Fence(false, false, false, false, false, false, false, true);
85848585
m_encoder->Push();
85858586
break;
85868587
}

IGC/Compiler/CISACodeGen/Platform.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1356,5 +1356,9 @@ class CPlatform {
13561356
bool enableLscSamplerRouting() const {
13571357
return isCoreChildOf(IGFX_XE3_CORE);
13581358
}
1359+
1360+
// Returns true when the ReplaceAtomicFenceWithSourceValue regkey is enabled,
// i.e. SynchronizationObjectCoalescing may replace a fence that exists only to
// order atomic operations with a GenISA_source_value intrinsic instead.
bool enableReplaceAtomicFenceWithSourceValue() const {
  return IGC_IS_FLAG_ENABLED(ReplaceAtomicFenceWithSourceValue);
}
13591363
};
13601364
} // namespace IGC

IGC/Compiler/Optimizer/SynchronizationObjectCoalescing.cpp

Lines changed: 186 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ SPDX-License-Identifier: MIT
1717
#include "Compiler/IGCPassSupport.h"
1818
#include "SynchronizationObjectCoalescing.hpp"
1919
#include "visa_igc_common_header.h"
20+
#include "llvm/IR/IRBuilder.h"
21+
#include "llvm/Analysis/CFG.h"
22+
#include "llvm/Analysis/LoopInfo.h"
2023
#include <utility>
2124
#include <map>
2225

@@ -285,6 +288,9 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
285288
static_cast<SynchronizationCaseMask>(WriteSyncRead | WriteSyncWrite | AtomicSyncRead | AtomicSyncWrite |
286289
WriteSyncAtomic | ReadSyncAtomic | ReadSyncWrite | AtomicSyncAtomic);
287290

291+
////////////////////////////////////////////////////////////////////////
292+
void CreateSourceValueInst(std::vector<llvm::Instruction *> &pAtomicInstToBeSourced, llvm::Instruction *pFenceInst);
293+
288294
////////////////////////////////////////////////////////////////////////
289295
void EraseRedundantInst(llvm::Instruction *pInst);
290296

@@ -327,6 +333,7 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
327333

328334
////////////////////////////////////////////////////////////////////////
329335
bool IsRequiredForAtomicOperationsOrdering(const llvm::Instruction *pSourceInst,
336+
std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
330337
bool onlyGlobalAtomics = false) const;
331338

332339
////////////////////////////////////////////////////////////////////////
@@ -440,6 +447,7 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
440447
std::vector<llvm::Instruction *> m_LscMemoryFences;
441448
std::vector<llvm::Instruction *> m_UntypedMemoryFences;
442449
std::vector<llvm::Instruction *> m_ThreadGroupBarriers;
450+
std::unordered_set<llvm::Instruction *> m_SourcedAtomicInstructions;
443451

444452
// this variable holds a mapping from a basic block to its memory instructions ordered by their occurrences in it
445453
// (the initial index of line of this basic block - the number of instructions preceding an instruction it its basic
@@ -466,6 +474,7 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
466474
InstMaskLookupTableT m_InstMaskLookupTable;
467475

468476
llvm::Function *m_CurrentFunction = nullptr;
477+
const CodeGenContext *ctx = nullptr;
469478
bool m_HasIndependentSharedMemoryFenceFunctionality = false;
470479
bool m_HasTypedMemoryFenceFunctionality = false;
471480
bool m_HasUrbFenceFunctionality = false;
@@ -507,7 +516,7 @@ SynchronizationObjectCoalescing::SynchronizationObjectCoalescing() : llvm::Funct
507516
////////////////////////////////////////////////////////////////////////
508517
bool SynchronizationObjectCoalescing::runOnFunction(llvm::Function &F) {
509518
m_CurrentFunction = &F;
510-
const CodeGenContext *const ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
519+
ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
511520
const ModuleMetaData *const md = ctx->getModuleMetaData();
512521
m_HasIndependentSharedMemoryFenceFunctionality =
513522
!ctx->platform.hasSLMFence() ||
@@ -538,6 +547,131 @@ bool SynchronizationObjectCoalescing::ProcessFunction() {
538547
return FindRedundancies();
539548
}
540549

550+
// Referenced from MemoryModelPass
// Scans the *direct* users of 'def' for a PHINode whose parent block is
// 'postDominator' itself or dominates it; such a PHI carries (a merge of)
// 'def' and can be sourced in place of 'def' when 'def' does not dominate
// 'postDominator'. Returns nullptr when no direct PHI user qualifies.
// NOTE(review): the worklist is never extended, so transitive PHI users are
// not considered; 'seen' only guards against duplicate entries in users().
static inline PHINode *FindDominatingPhi(DominatorTree &DT, Instruction *def, BasicBlock *postDominator) {
  // Preconditions: 'def' lives outside 'postDominator' and does not dominate it
  // (otherwise 'def' itself could be used directly and no PHI is needed).
  IGC_ASSERT(def->getParent() != postDominator);
  IGC_ASSERT(!DT.dominates(def, postDominator));
  SmallPtrSet<PHINode *, 8> seen;
  SmallVector<User *, 8> worklist(def->users());
  while (!worklist.empty()) {
    PHINode *phi = dyn_cast<PHINode>(worklist.pop_back_val());
    if (phi == nullptr || seen.count(phi) > 0) {
      continue;
    }
    if (phi->getParent() == postDominator || DT.dominates(phi, postDominator)) {
      return phi;
    }
    seen.insert(phi);
  }
  return nullptr;
}
568+
569+
////////////////////////////////////////////////////////////////////////
/// @brief Fence Instruction responsible for only ordering of atomic Instructions
/// can be replaced with Source Value Intrinsic which will still maintain
/// the order of Instructions
/// @param pAtomicInstToBeSourced atomic instructions (collected in reverse
///        program order by IsRequiredForAtomicOperationsOrdering) whose results
///        must be "sourced" before the fence can be removed
/// @param pFenceInst the redundant fence instruction about to be erased
void SynchronizationObjectCoalescing::CreateSourceValueInst(std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
                                                            llvm::Instruction *pFenceInst) {
  if (pAtomicInstToBeSourced.size() == 0) {
    return;
  }
  // reversing the list to source the atomic instructions in the order
  // (the callers push them while walking backwards from the fence)
  reverse(pAtomicInstToBeSourced.begin(), pAtomicInstToBeSourced.end());
  Function *funcPtr = GenISAIntrinsic::getDeclaration(pFenceInst->getModule(), GenISAIntrinsic::GenISA_source_value);
  BasicBlock *fenceBB = pFenceInst->getParent();

  // Fresh analyses per call; the pass does not register these as pass
  // dependencies, so they are built on demand here.
  Function *F = pAtomicInstToBeSourced[0]->getFunction();
  DominatorTree DT(*F);
  PostDominatorTree PDT(*F);
  LoopInfo LI(DT);

  // Per-fence dedup set: avoids emitting two source_value calls for the same
  // SSA value (e.g. one PHI merging several of the atomics). Local despite the
  // member-style "m_" prefix.
  std::unordered_set<llvm::Value *> m_SourcedValues;

  for (llvm::Instruction *atomicInst : pAtomicInstToBeSourced) {
    // Making sure that the Fence Inst is potentially reachable from the atomic Instruction.
    if (!isPotentiallyReachable(atomicInst, pFenceInst, nullptr, &DT, &LI)) {
      continue;
    }

    // In few shaders, the atomic instructions were already sourced before unification. Skip creating one in this case
    bool atomicSourced = false;
    for (User *U : atomicInst->users()) {
      if (GenIntrinsicInst *Inst = dyn_cast<GenIntrinsicInst>(U)) {
        // TODO: If dominates fail, then either move the source_value to BB that dominates fence and post dominates
        // atomic or delete them and let the below code create new ones. Requires further testing.
        if (Inst->getIntrinsicID() == GenISAIntrinsic::GenISA_source_value &&
            DT.dominates(Inst->getParent(), fenceBB)) {
          atomicSourced = true;
          break;
        }
      }
    }
    if (!atomicSourced) {
      BasicBlock *atomicBB = atomicInst->getParent();
      BasicBlock *fenceDominator = fenceBB;
      // Default insert point: end of the atomic's own block (always safe).
      Instruction *insertPoint = atomicBB->getTerminator();
      Value *sourceVal = cast<GenIntrinsicInst>(atomicInst);

      // TODO: Determining Insert point can be improved which can postpone the source value intrinsic as long as
      // possible. Similar analysis is done in FindOptimalInsertPoints() in ApplyCacheControls.cpp

      // Check if fence Instruction BB post dominates atomic Instruction BB
      // Else find the BB that is a predecessor of fence BB and post dominates atomic BB.
      // If we don't find one, then the insert point is near the terminator of atomic BB
      // NOTE(review): the search only walks the single-predecessor chain up from
      // the fence block; blocks with multiple predecessors end the walk.
      while (fenceDominator && fenceDominator != atomicBB) {
        if (PDT.dominates(fenceDominator, atomicBB)) {
          // If fence instruction is in same BB, then use fence as insert point
          // Else use the terminator of fenceDominator as insert point
          insertPoint = fenceBB == fenceDominator ? pFenceInst : fenceDominator->getTerminator();
          // It's possible that the atomic instruction does not dominate
          // the post-dominator, find a PHI user of the atomic instruction
          // that dominates the post-dominator.
          if (!DT.dominates(atomicBB, fenceDominator)) {
            PHINode *phi = FindDominatingPhi(DT, atomicInst, fenceDominator);
            if (phi) {
              sourceVal = phi;
            } else {
              // Fallback to inserting the source value in the basic
              // block with the atomic instruction.
              insertPoint = atomicBB->getTerminator();
            }
          }
          break;
        }
        fenceDominator = fenceDominator->getSinglePredecessor();
      }
      // A Fence can synchronize a phi node which is coming from atomic instructions in different branches.
      // Avoid creating a duplicate source value intrinsic in this case.
      if (m_SourcedValues.find(sourceVal) == m_SourcedValues.end()) {
        m_SourcedValues.insert(sourceVal);
        // If Fence is present in same BB as atomic, then insert at Fence Instruction
        if (fenceBB == atomicBB) {
          insertPoint = pFenceInst;
        }

        IRBuilder<> builder(insertPoint);
        Type *sourceValType = sourceVal->getType();

        // Source value intrinsic accepts only i32.
        if (sourceValType->isIntegerTy()) {
          sourceVal = builder.CreateZExtOrTrunc(sourceVal, builder.getInt32Ty());
        } else if (sourceValType->isFloatingPointTy()) {
          if (sourceValType->isFloatTy()) {
            sourceVal = builder.CreateBitCast(sourceVal, builder.getInt32Ty());
          } else {
            // NOTE(review): non-f32 FP types (half/double) are value-converted
            // via FPToUI rather than bit-preserved — presumably only the data
            // dependency matters for ordering, not the value; confirm.
            sourceVal = builder.CreateFPToUI(sourceVal, builder.getInt32Ty());
          }
        } else {
          IGC_ASSERT_MESSAGE(0, "Unexpected type");
        }

        builder.CreateCall(funcPtr, {sourceVal});
      }
    }
    // Record the atomic as handled even when already sourced, so a later fence
    // does not emit a second source_value for it (checked by
    // IsRequiredForAtomicOperationsOrdering).
    m_SourcedAtomicInstructions.insert(atomicInst);
  }
}
674+
541675
////////////////////////////////////////////////////////////////////////
542676
void SynchronizationObjectCoalescing::EraseRedundantInst(llvm::Instruction *pInst) {
543677
bool isFence = IsFenceOperation(pInst);
@@ -740,7 +874,13 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
740874
}
741875
SynchronizationCaseMask referenceSyncCaseMask = GetStrictSynchronizationMask(pInst);
742876
bool isObligatory = (syncCaseMask & referenceSyncCaseMask) != 0;
743-
isObligatory |= IsRequiredForAtomicOperationsOrdering(pInst, true /*onlyGlobalAtomics*/);
877+
878+
std::vector<llvm::Instruction *> atomicInstToBeSourced;
879+
if (!isObligatory) {
880+
isObligatory =
881+
IsRequiredForAtomicOperationsOrdering(pInst, atomicInstToBeSourced, true /*onlyGlobalAtomics*/);
882+
}
883+
744884
bool verifyUnsynchronizedInstructions = IsFenceOperation(pInst);
745885
verifyUnsynchronizedInstructions &= (!isObligatory || syncCaseMask == ReadSyncWrite);
746886

@@ -767,6 +907,9 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
767907
#if _DEBUG
768908
RegisterRedundancyExplanation(pInst, ExplanationEntry::GlobalMemoryRedundancy);
769909
#endif // _DEBUG
910+
if (ctx->platform.enableReplaceAtomicFenceWithSourceValue()) {
911+
CreateSourceValueInst(atomicInstToBeSourced, const_cast<Instruction *>(pInst));
912+
}
770913
EraseRedundantGlobalScope(pInst);
771914
isModified = true;
772915
SetLocalMemoryInstructionMask();
@@ -831,7 +974,12 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
831974
GetSynchronizationMaskForAllResources(localForwardMemoryInstructionMask, localBackwardMemoryInstructionMask);
832975
SynchronizationCaseMask referenceSyncCaseMask = GetStrictSynchronizationMask(pInst);
833976
bool isObligatory = (syncCaseMask & referenceSyncCaseMask) != 0;
834-
isObligatory |= IsRequiredForAtomicOperationsOrdering(pInst);
977+
978+
std::vector<llvm::Instruction *> atomicInstToBeSourced;
979+
if (!isObligatory) {
980+
isObligatory = IsRequiredForAtomicOperationsOrdering(pInst, atomicInstToBeSourced);
981+
}
982+
835983
bool verifyUnsynchronizedInstructions = IsFenceOperation(pInst);
836984
verifyUnsynchronizedInstructions &= (!isObligatory || syncCaseMask == ReadSyncWrite);
837985

@@ -847,6 +995,9 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
847995
#if _DEBUG
848996
RegisterRedundancyExplanation(pInst, ExplanationEntry::StrictRedundancy);
849997
#endif // _DEBUG
998+
if (ctx->platform.enableReplaceAtomicFenceWithSourceValue()) {
999+
CreateSourceValueInst(atomicInstToBeSourced, const_cast<Instruction *>(pInst));
1000+
}
8501001
EraseRedundantInst(pInst);
8511002
isModified = true;
8521003
}
@@ -1731,8 +1882,9 @@ SynchronizationObjectCoalescing::GetUnsynchronizedForwardInstructionMask(const l
17311882
/// operations present before the fence (in program order)
17321883
/// @param pSourceInst the source synchronization instruction
17331884
/// @param onlyGlobalAtomics check only TGM and UGM atomic operations
1734-
bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(const llvm::Instruction *pSourceInst,
1735-
bool onlyGlobalAtomics /*= false*/) const {
1885+
bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(
1886+
const llvm::Instruction *pSourceInst, std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
1887+
bool onlyGlobalAtomics /*= false*/) const {
17361888
if (!IsFenceOperation(pSourceInst)) {
17371889
// Not a fence, nothing to check
17381890
return false;
@@ -1782,6 +1934,10 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
17821934
{
17831935
isPotentiallyUnsynchronizedAtomic = false;
17841936
// Lambda that checks if a fence operation synchronizes the atomic operation.
1937+
// This can be improved to detect the users of atomic instruction and end the search for fences once we find the
1938+
// user. This user is essentially same as Source Value Intrinsic, however it can be reordered in visa affecting
1939+
// the execution order of atomic instructions. If we can find a way to treat this user as a special instruction
1940+
// and avoid reordering, we can skip creating new source value instruction.
17851941
std::function<bool(const llvm::Instruction *)> IsBoundaryInst = [this, &atomicPointerMemoryInstructionMask,
17861942
&isPotentiallyUnsynchronizedAtomic,
17871943
pSourceInst](const llvm::Instruction *pInst) {
@@ -1832,6 +1988,11 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
18321988
for (llvm::BasicBlock::const_iterator it = ++pSourceInst->getIterator(); it != pSourceInst->getParent()->end();
18331989
++it) {
18341990
const llvm::Instruction *pCurrInst = &(*it);
1991+
// If we encounter an atomic instruction after pSourceInst (Source Fence), then the fence is required to execute
1992+
// pInst (initial atomic) before pCurrInst (current atomic)
1993+
if (IsAtomicOperation(pCurrInst)) {
1994+
break;
1995+
}
18351996
if (IsFenceOperation(pCurrInst) && IsSubstituteInstruction(pCurrInst, pSourceInst)) {
18361997
substituteFenceFound = true;
18371998
break;
@@ -1840,7 +2001,22 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
18402001
if (!substituteFenceFound) {
18412002
// Found an atomic operation that requires the source fence
18422003
// instruction for correct memory ordering.
1843-
return true;
2004+
2005+
// If ReplaceAtomicFenceWithSourceValue is true, we can replace this fence with GenISA_source_value.
2006+
// This will source the atomic instruction and still maintains the order of atomic instructions.
2007+
// Else return true marking the fence instruction as Obligatory.
2008+
2009+
if (ctx->platform.enableReplaceAtomicFenceWithSourceValue()) {
2010+
// If a previous fence was replaced with source value intrinsic, GetVisibleMemoryInstructions will add the
2011+
// same atomic instruction again for the next fence resulting in multiple source value intrinsics but we need
2012+
// it to be sourced only once. Hence we check if it was already sourced previously. Continues to check all
2013+
// valid atomic Instructions to be sourced.
2014+
if (m_SourcedAtomicInstructions.find(const_cast<Instruction *>(pInst)) == m_SourcedAtomicInstructions.end()) {
2015+
pAtomicInstToBeSourced.push_back(const_cast<Instruction *>(pInst));
2016+
}
2017+
} else {
2018+
return true;
2019+
}
18442020
}
18452021
}
18462022
}
@@ -1866,6 +2042,9 @@ SynchronizationObjectCoalescing::GetInstructionMask(const std::vector<const llvm
18662042
/// and it means that this instruction must be equal or weaker than the evaluated one.
18672043
bool SynchronizationObjectCoalescing::IsSubstituteInstruction(const llvm::Instruction *pEvaluatedInst,
18682044
const llvm::Instruction *pReferenceInst) const {
2045+
if (pEvaluatedInst == pReferenceInst) {
2046+
return false;
2047+
}
18692048
if (IsUntypedMemoryFenceOperation(pEvaluatedInst) && IsUntypedMemoryFenceOperation(pReferenceInst)) {
18702049
const uint32_t commitEnableArg = 0;
18712050
const uint32_t L3FlushRWDataArg = 1;
@@ -2002,6 +2181,7 @@ void SynchronizationObjectCoalescing::InvalidateMembers() {
20022181
m_OrderedFenceInstructionsInBasicBlockCache.clear();
20032182
m_OrderedBarrierInstructionsInBasicBlockCache.clear();
20042183
m_BasicBlockMemoryInstructionMaskCache.clear();
2184+
m_SourcedAtomicInstructions.clear();
20052185
#if _DEBUG
20062186
m_ExplanationEntries.clear();
20072187
#endif // _DEBUG

IGC/common/igc_flags.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,11 @@ DECLARE_IGC_REGKEY(
440440
"The mask is casted to IGC::SyncInstMask and informs which synchronization objects should not be coalesced. Note "
441441
"that synchronization objects classified in multiple types are not disabled if any bit describing them is off.",
442442
true)
443+
DECLARE_IGC_REGKEY(
444+
bool, ReplaceAtomicFenceWithSourceValue, false,
445+
"Fences are required to maintain the order of atomic memory instructions. This flag will replace the fence with "
446+
"GenISA_source_value intrinsic which sources the result of atomic operation and still maintains the order.",
447+
true)
443448
DECLARE_IGC_REGKEY(bool, UnrollLoopForCodeSizeOnly, false,
444449
"Only unroll the loop if it can reduce program size/register pressure. Ignore all other threshold "
445450
"setting but still enable PromoteLoopUnrollwithAlloca due to high likelyhood to reduce size.",

0 commit comments

Comments (0)