@@ -17,6 +17,9 @@ SPDX-License-Identifier: MIT
#include "Compiler/IGCPassSupport.h"
#include "SynchronizationObjectCoalescing.hpp"
#include "visa_igc_common_header.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/LoopInfo.h"
#include <algorithm>
#include <map>
#include <utility>
2225
@@ -285,6 +288,9 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
285288 static_cast <SynchronizationCaseMask>(WriteSyncRead | WriteSyncWrite | AtomicSyncRead | AtomicSyncWrite |
286289 WriteSyncAtomic | ReadSyncAtomic | ReadSyncWrite | AtomicSyncAtomic);
287290
291+ // //////////////////////////////////////////////////////////////////////
292+ void CreateSourceValueInst (std::vector<llvm::Instruction *> &pAtomicInstToBeSourced, llvm::Instruction *pFenceInst);
293+
288294 // //////////////////////////////////////////////////////////////////////
289295 void EraseRedundantInst (llvm::Instruction *pInst);
290296
@@ -327,6 +333,7 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
327333
328334 // //////////////////////////////////////////////////////////////////////
329335 bool IsRequiredForAtomicOperationsOrdering (const llvm::Instruction *pSourceInst,
336+ std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
330337 bool onlyGlobalAtomics = false ) const ;
331338
332339 // //////////////////////////////////////////////////////////////////////
@@ -440,6 +447,7 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
440447 std::vector<llvm::Instruction *> m_LscMemoryFences;
441448 std::vector<llvm::Instruction *> m_UntypedMemoryFences;
442449 std::vector<llvm::Instruction *> m_ThreadGroupBarriers;
450+ std::unordered_set<llvm::Instruction *> m_SourcedAtomicInstructions;
443451
444452 // this variable holds a mapping from a basic block to its memory instructions ordered by their occurrences in it
445453 // (the initial index of line of this basic block - the number of instructions preceding an instruction it its basic
@@ -466,6 +474,7 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
466474 InstMaskLookupTableT m_InstMaskLookupTable;
467475
468476 llvm::Function *m_CurrentFunction = nullptr ;
477+ const CodeGenContext *ctx = nullptr ;
469478 bool m_HasIndependentSharedMemoryFenceFunctionality = false ;
470479 bool m_HasTypedMemoryFenceFunctionality = false ;
471480 bool m_HasUrbFenceFunctionality = false ;
@@ -507,7 +516,7 @@ SynchronizationObjectCoalescing::SynchronizationObjectCoalescing() : llvm::Funct
507516// //////////////////////////////////////////////////////////////////////
508517bool SynchronizationObjectCoalescing::runOnFunction (llvm::Function &F) {
509518 m_CurrentFunction = &F;
510- const CodeGenContext * const ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext ();
519+ ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext ();
511520 const ModuleMetaData *const md = ctx->getModuleMetaData ();
512521 m_HasIndependentSharedMemoryFenceFunctionality =
513522 !ctx->platform .hasSLMFence () ||
@@ -538,6 +547,131 @@ bool SynchronizationObjectCoalescing::ProcessFunction() {
538547 return FindRedundancies ();
539548}
540549
550+ // Referenced from MemoryModelPass
551+ static inline PHINode *FindDominatingPhi (DominatorTree &DT, Instruction *def, BasicBlock *postDominator) {
552+ IGC_ASSERT (def->getParent () != postDominator);
553+ IGC_ASSERT (!DT.dominates (def, postDominator));
554+ SmallPtrSet<PHINode *, 8 > seen;
555+ SmallVector<User *, 8 > worklist (def->users ());
556+ while (!worklist.empty ()) {
557+ PHINode *phi = dyn_cast<PHINode>(worklist.pop_back_val ());
558+ if (phi == nullptr || seen.count (phi) > 0 ) {
559+ continue ;
560+ }
561+ if (phi->getParent () == postDominator || DT.dominates (phi, postDominator)) {
562+ return phi;
563+ }
564+ seen.insert (phi);
565+ }
566+ return nullptr ;
567+ }
568+
569+ // //////////////////////////////////////////////////////////////////////
570+ // / @brief Fence Instruction responsible for only ordering of atomic Instructions
571+ // / can be replaced with Source Value Intrinsic which will still maintain
572+ // / the order of Instructions
573+ void SynchronizationObjectCoalescing::CreateSourceValueInst (std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
574+ llvm::Instruction *pFenceInst) {
575+ if (pAtomicInstToBeSourced.size () == 0 ) {
576+ return ;
577+ }
578+ // reversing the list to source the atomic instructions in the order
579+ reverse (pAtomicInstToBeSourced.begin (), pAtomicInstToBeSourced.end ());
580+ Function *funcPtr = GenISAIntrinsic::getDeclaration (pFenceInst->getModule (), GenISAIntrinsic::GenISA_source_value);
581+ BasicBlock *fenceBB = pFenceInst->getParent ();
582+
583+ Function *F = pAtomicInstToBeSourced[0 ]->getFunction ();
584+ DominatorTree DT (*F);
585+ PostDominatorTree PDT (*F);
586+ LoopInfo LI (DT);
587+
588+ std::unordered_set<llvm::Value *> m_SourcedValues;
589+
590+ for (llvm::Instruction *atomicInst : pAtomicInstToBeSourced) {
591+ // Making sure that the Fence Inst is potentially reachable from the atomic Instruction.
592+ if (!isPotentiallyReachable (atomicInst, pFenceInst, nullptr , &DT, &LI)) {
593+ continue ;
594+ }
595+
596+ // In few shaders, the atomic instructions were already sourced before unification. Skip creating one in this case
597+ bool atomicSourced = false ;
598+ for (User *U : atomicInst->users ()) {
599+ if (GenIntrinsicInst *Inst = dyn_cast<GenIntrinsicInst>(U)) {
600+ // TODO: If dominates fail, then either move the source_value to BB that dominates fence and post dominates
601+ // atomic or delete them and let the below code create new ones. Requires further testing.
602+ if (Inst->getIntrinsicID () == GenISAIntrinsic::GenISA_source_value &&
603+ DT.dominates (Inst->getParent (), fenceBB)) {
604+ atomicSourced = true ;
605+ break ;
606+ }
607+ }
608+ }
609+ if (!atomicSourced) {
610+ BasicBlock *atomicBB = atomicInst->getParent ();
611+ BasicBlock *fenceDominator = fenceBB;
612+ Instruction *insertPoint = atomicBB->getTerminator ();
613+ Value *sourceVal = cast<GenIntrinsicInst>(atomicInst);
614+
615+ // TODO: Determining Insert point can be improved which can postpone the source value intrinsic as long as
616+ // possible. Similar analysis is done in FindOptimalInsertPoints() in ApplyCacheControls.cpp
617+
618+ // Check if fence Instruction BB post dominates atomic Instruction BB
619+ // Else find the BB that is a predecessor of fence BB and post dominates atomic BB.
620+ // If we don't find one, then the insert point is near the terminator of atomic BB
621+ while (fenceDominator && fenceDominator != atomicBB) {
622+ if (PDT.dominates (fenceDominator, atomicBB)) {
623+ // If fence instruction is in same BB, then use fence as insert point
624+ // Else use the terminator of fenceDominator as insert point
625+ insertPoint = fenceBB == fenceDominator ? pFenceInst : fenceDominator->getTerminator ();
626+ // It's possible that the atomic instruction does not dominate
627+ // the post-dominator, find a PHI user of the atomic instruction
628+ // that dominates the post-dominator.
629+ if (!DT.dominates (atomicBB, fenceDominator)) {
630+ PHINode *phi = FindDominatingPhi (DT, atomicInst, fenceDominator);
631+ if (phi) {
632+ sourceVal = phi;
633+ } else {
634+ // Fallback to inserting the source value in the basic
635+ // block with the atomic instruction.
636+ insertPoint = atomicBB->getTerminator ();
637+ }
638+ }
639+ break ;
640+ }
641+ fenceDominator = fenceDominator->getSinglePredecessor ();
642+ }
643+ // A Fence can synchronize a phi node which is coming from atomic instructions in different branches.
644+ // Avoid creating a duplicate source value intrinsic in this case.
645+ if (m_SourcedValues.find (sourceVal) == m_SourcedValues.end ()) {
646+ m_SourcedValues.insert (sourceVal);
647+ // If Fence is present in same BB as atomic, then insert at Fence Instruction
648+ if (fenceBB == atomicBB) {
649+ insertPoint = pFenceInst;
650+ }
651+
652+ IRBuilder<> builder (insertPoint);
653+ Type *sourceValType = sourceVal->getType ();
654+
655+ // Source value intrinsic accepts only i32.
656+ if (sourceValType->isIntegerTy ()) {
657+ sourceVal = builder.CreateZExtOrTrunc (sourceVal, builder.getInt32Ty ());
658+ } else if (sourceValType->isFloatingPointTy ()) {
659+ if (sourceValType->isFloatTy ()) {
660+ sourceVal = builder.CreateBitCast (sourceVal, builder.getInt32Ty ());
661+ } else {
662+ sourceVal = builder.CreateFPToUI (sourceVal, builder.getInt32Ty ());
663+ }
664+ } else {
665+ IGC_ASSERT_MESSAGE (0 , " Unexpected type" );
666+ }
667+
668+ builder.CreateCall (funcPtr, {sourceVal});
669+ }
670+ }
671+ m_SourcedAtomicInstructions.insert (atomicInst);
672+ }
673+ }
674+
541675// //////////////////////////////////////////////////////////////////////
542676void SynchronizationObjectCoalescing::EraseRedundantInst (llvm::Instruction *pInst) {
543677 bool isFence = IsFenceOperation (pInst);
@@ -740,7 +874,13 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
740874 }
741875 SynchronizationCaseMask referenceSyncCaseMask = GetStrictSynchronizationMask (pInst);
742876 bool isObligatory = (syncCaseMask & referenceSyncCaseMask) != 0 ;
743- isObligatory |= IsRequiredForAtomicOperationsOrdering (pInst, true /* onlyGlobalAtomics*/ );
877+
878+ std::vector<llvm::Instruction *> atomicInstToBeSourced;
879+ if (!isObligatory) {
880+ isObligatory =
881+ IsRequiredForAtomicOperationsOrdering (pInst, atomicInstToBeSourced, true /* onlyGlobalAtomics*/ );
882+ }
883+
744884 bool verifyUnsynchronizedInstructions = IsFenceOperation (pInst);
745885 verifyUnsynchronizedInstructions &= (!isObligatory || syncCaseMask == ReadSyncWrite);
746886
@@ -767,6 +907,9 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
767907#if _DEBUG
768908 RegisterRedundancyExplanation (pInst, ExplanationEntry::GlobalMemoryRedundancy);
769909#endif // _DEBUG
910+ if (ctx->platform .enableReplaceAtomicFenceWithSourceValue ()) {
911+ CreateSourceValueInst (atomicInstToBeSourced, const_cast <Instruction *>(pInst));
912+ }
770913 EraseRedundantGlobalScope (pInst);
771914 isModified = true ;
772915 SetLocalMemoryInstructionMask ();
@@ -831,7 +974,12 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
831974 GetSynchronizationMaskForAllResources (localForwardMemoryInstructionMask, localBackwardMemoryInstructionMask);
832975 SynchronizationCaseMask referenceSyncCaseMask = GetStrictSynchronizationMask (pInst);
833976 bool isObligatory = (syncCaseMask & referenceSyncCaseMask) != 0 ;
834- isObligatory |= IsRequiredForAtomicOperationsOrdering (pInst);
977+
978+ std::vector<llvm::Instruction *> atomicInstToBeSourced;
979+ if (!isObligatory) {
980+ isObligatory = IsRequiredForAtomicOperationsOrdering (pInst, atomicInstToBeSourced);
981+ }
982+
835983 bool verifyUnsynchronizedInstructions = IsFenceOperation (pInst);
836984 verifyUnsynchronizedInstructions &= (!isObligatory || syncCaseMask == ReadSyncWrite);
837985
@@ -847,6 +995,9 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
847995#if _DEBUG
848996 RegisterRedundancyExplanation (pInst, ExplanationEntry::StrictRedundancy);
849997#endif // _DEBUG
998+ if (ctx->platform .enableReplaceAtomicFenceWithSourceValue ()) {
999+ CreateSourceValueInst (atomicInstToBeSourced, const_cast <Instruction *>(pInst));
1000+ }
8501001 EraseRedundantInst (pInst);
8511002 isModified = true ;
8521003 }
@@ -1731,8 +1882,9 @@ SynchronizationObjectCoalescing::GetUnsynchronizedForwardInstructionMask(const l
17311882// / operations present before the fence (in program order)
17321883// / @param pSourceInst the source synchronization instruction
17331884// / @param onlyGlobalAtomics check only TGM and UGM atomic operations
1734- bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering (const llvm::Instruction *pSourceInst,
1735- bool onlyGlobalAtomics /* = false*/ ) const {
1885+ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering (
1886+ const llvm::Instruction *pSourceInst, std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
1887+ bool onlyGlobalAtomics /* = false*/ ) const {
17361888 if (!IsFenceOperation (pSourceInst)) {
17371889 // Not a fence, nothing to check
17381890 return false ;
@@ -1782,6 +1934,10 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
17821934 {
17831935 isPotentiallyUnsynchronizedAtomic = false ;
17841936 // Lambda that checks if a fence operation synchronizes the atomic operation.
1937+ // This can be improved to detect the users of atomic instruction and end the search for fences once we find the
1938+ // user. This user is essentially same as Source Value Intrinsic, however it can be reordered in visa affecting
1939+ // the execution order of atomic instructions. If we can find a way to treat this user as a special instruction
1940+ // and avoid reordering, we can skip creating new source value instruction.
17851941 std::function<bool (const llvm::Instruction *)> IsBoundaryInst = [this , &atomicPointerMemoryInstructionMask,
17861942 &isPotentiallyUnsynchronizedAtomic,
17871943 pSourceInst](const llvm::Instruction *pInst) {
@@ -1832,6 +1988,11 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
18321988 for (llvm::BasicBlock::const_iterator it = ++pSourceInst->getIterator (); it != pSourceInst->getParent ()->end ();
18331989 ++it) {
18341990 const llvm::Instruction *pCurrInst = &(*it);
1991+ // If we encounter an atomic instruction after pSourceInst (Source Fence), then the fence is required to execute
1992+ // pInst (initial atomic) before pCurrInst (current atomic)
1993+ if (IsAtomicOperation (pCurrInst)) {
1994+ break ;
1995+ }
18351996 if (IsFenceOperation (pCurrInst) && IsSubstituteInstruction (pCurrInst, pSourceInst)) {
18361997 substituteFenceFound = true ;
18371998 break ;
@@ -1840,7 +2001,22 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
18402001 if (!substituteFenceFound) {
18412002 // Found an atomic operation that requires the source fence
18422003 // instruction for correct memory ordering.
1843- return true ;
2004+
2005+ // If ReplaceAtomicFenceWithSourceValue is true, we can replace this fence with GenISA_source_value.
2006+ // This will source the atomic instruction and still maintains the order of atomic instructions.
2007+ // Else return true marking the fence instruction as Obligatory.
2008+
2009+ if (ctx->platform .enableReplaceAtomicFenceWithSourceValue ()) {
2010+ // If a previous fence was replaced with source value intrinsic, GetVisibleMemoryInstructions will add the
2011+ // same atomic instruction again for the next fence resulting in multiple source value intrinsics but we need
2012+ // it to be sourced only once. Hence we check if it was already sourced previously. Continues to check all
2013+ // valid atomic Instructions to be sourced.
2014+ if (m_SourcedAtomicInstructions.find (const_cast <Instruction *>(pInst)) == m_SourcedAtomicInstructions.end ()) {
2015+ pAtomicInstToBeSourced.push_back (const_cast <Instruction *>(pInst));
2016+ }
2017+ } else {
2018+ return true ;
2019+ }
18442020 }
18452021 }
18462022 }
@@ -1866,6 +2042,9 @@ SynchronizationObjectCoalescing::GetInstructionMask(const std::vector<const llvm
18662042// / and it means that this instruction must be equal or weaker than the evaluated one.
18672043bool SynchronizationObjectCoalescing::IsSubstituteInstruction (const llvm::Instruction *pEvaluatedInst,
18682044 const llvm::Instruction *pReferenceInst) const {
2045+ if (pEvaluatedInst == pReferenceInst) {
2046+ return false ;
2047+ }
18692048 if (IsUntypedMemoryFenceOperation (pEvaluatedInst) && IsUntypedMemoryFenceOperation (pReferenceInst)) {
18702049 const uint32_t commitEnableArg = 0 ;
18712050 const uint32_t L3FlushRWDataArg = 1 ;
@@ -2002,6 +2181,7 @@ void SynchronizationObjectCoalescing::InvalidateMembers() {
20022181 m_OrderedFenceInstructionsInBasicBlockCache.clear ();
20032182 m_OrderedBarrierInstructionsInBasicBlockCache.clear ();
20042183 m_BasicBlockMemoryInstructionMaskCache.clear ();
2184+ m_SourcedAtomicInstructions.clear ();
20052185#if _DEBUG
20062186 m_ExplanationEntries.clear ();
20072187#endif // _DEBUG
0 commit comments