diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl-linux-build.yml index 13583e7a44142..e3b350fbac3cb 100644 --- a/.github/workflows/sycl-linux-build.yml +++ b/.github/workflows/sycl-linux-build.yml @@ -339,10 +339,6 @@ jobs: sycl_compiler: $GITHUB_WORKSPACE/toolchain/bin/clang++ extra_lit_opts: --param sycl_build_targets="spir;nvidia;amd" - - name: Remove E2E tests before spirv-backend run - if: ${{ inputs.e2e_binaries_spirv_backend_artifact && !cancelled() && steps.build.conclusion == 'success' }} - run: rm -rf build-e2e - - name: Build E2E tests with SPIR-V Backend if: ${{ inputs.e2e_binaries_spirv_backend_artifact && !cancelled() && steps.build.conclusion == 'success' }} uses: ./devops/actions/run-tests/e2e @@ -354,10 +350,6 @@ jobs: sycl_compiler: $GITHUB_WORKSPACE/toolchain/bin/clang++ extra_lit_opts: --param spirv-backend=True - - name: Remove E2E tests before preview-mode run - if: ${{ inputs.e2e_binaries_preview_artifact && !cancelled() && steps.build.conclusion == 'success' }} - run: rm -rf build-e2e - - name: Build E2E tests in Preview Mode if: ${{ inputs.e2e_binaries_preview_artifact && !cancelled() && steps.build.conclusion == 'success' }} uses: ./devops/actions/run-tests/e2e diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index e09db20b90369..351f7ba3a1897 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -222,23 +222,20 @@ jobs: fail-fast: false matrix: include: - - name: Intel L0 Gen12 GPU + - name: Intel Gen12 GPU runner: '["Windows", "gen12"]' - target_devices: level_zero:gpu - - name: Intel L0 Arc GPU + - name: Intel Arc GPU runner: '["Windows", "arc"]' - target_devices: level_zero:gpu - - name: Intel L0 Battlemage GPU + - name: Intel Battlemage GPU runner: '["Windows", "bmg"]' - target_devices: level_zero:gpu uses: ./.github/workflows/sycl-windows-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix.runner }} - target_devices: ${{ 
matrix.target_devices }} + target_devices: level_zero:gpu toolchain_artifact_filename: ${{ needs.build-win.outputs.toolchain_artifact_filename }} cuda-aws-start: diff --git a/.github/workflows/ur-build-hw.yml b/.github/workflows/ur-build-hw.yml index 709945c95fc3e..a50b8cb2d0945 100644 --- a/.github/workflows/ur-build-hw.yml +++ b/.github/workflows/ur-build-hw.yml @@ -126,7 +126,7 @@ jobs: - name: Download DPC++ run: | - wget -O dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-12-12/sycl_linux.tar.gz + wget -O dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2025-11-28/sycl_linux.tar.gz mkdir -p dpcpp_compiler tar -xvf dpcpp_compiler.tar.gz -C dpcpp_compiler diff --git a/buildbot/compile.py b/buildbot/compile.py index 055abd6dab739..de5a776f35003 100644 --- a/buildbot/compile.py +++ b/buildbot/compile.py @@ -31,15 +31,18 @@ def do_compile(args): "cmake", "--build", abs_obj_dir, + ] + + if args.verbose: + cmake_cmd.append("--verbose") + + cmake_cmd += [ "--", args.build_target, "-j", str(cpu_count), ] - if args.verbose: - cmake_cmd.append("--verbose") - print("[Cmake Command]: {}".format(" ".join(cmake_cmd))) subprocess.check_call(cmake_cmd, cwd=abs_obj_dir) diff --git a/buildbot/configure.py b/buildbot/configure.py index 95a05b54a0e99..5bddddec67a06 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -21,7 +21,7 @@ def do_configure(args, passthrough_args): if not os.path.isdir(abs_obj_dir): os.makedirs(abs_obj_dir) - llvm_external_projects = "sycl;llvm-spirv;opencl;xpti;xptifw" + llvm_external_projects = "sycl;llvm-spirv;opencl;xpti;xptifw;compiler-rt" # libdevice build requires a working SYCL toolchain, which is not the case # with macOS target right now. 
diff --git a/clang/include/clang/Sema/SemaSYCL.h b/clang/include/clang/Sema/SemaSYCL.h index a4f6ced16fa03..5cbc96a33d173 100644 --- a/clang/include/clang/Sema/SemaSYCL.h +++ b/clang/include/clang/Sema/SemaSYCL.h @@ -65,7 +65,8 @@ class SYCLIntegrationHeader { kind_work_group_memory, kind_dynamic_work_group_memory, kind_dynamic_accessor, - kind_last = kind_dynamic_accessor + kind_struct_with_special_type, // structs that contain special types + kind_last = kind_struct_with_special_type }; public: @@ -118,6 +119,9 @@ class SYCLIntegrationHeader { /// integration header is required. void addHostPipeRegistration() { NeedToEmitHostPipeRegistration = true; } + /// Set the ParentStruct field + void setParentStruct(ParmVarDecl *parent); + private: // Kernel actual parameter descriptor. struct KernelParamDesc { @@ -205,6 +209,20 @@ class SYCLIntegrationHeader { /// Keeps track of whether declaration of __sycl_host_pipe_registration /// type and __sycl_host_pipe_registrar variable are required to emit. bool NeedToEmitHostPipeRegistration = false; + + // For free function kernels, keeps track of the parameter that is currently + // being analyzed if it is a struct that contains special types. + ParmVarDecl *ParentStruct = nullptr; + + // For every struct that contains a special type which is given by + // the ParentStruct field above, record the offset and size of its fields + // at any nesting level. Store the information in the variable below. + llvm::DenseMap>> + OffsetSizeInfo; + // Likewise for the kind of a field i.e. accessor, std_layout, etc. + llvm::DenseMap> + KindInfo; }; class SYCLIntegrationFooter { @@ -267,6 +285,10 @@ class SemaSYCL : public SemaBase { llvm::DenseSet FreeFunctionDeclarations; + // A set that keeps track of all structs encountered with + // special types inside. Relevant for free function kernels only.
+ llvm::DenseSet StructsWithSpecialTypes; + public: SemaSYCL(Sema &S); @@ -317,6 +339,13 @@ class SemaSYCL : public SemaBase { SYCLKernelFunctions.insert(FD); } + /// Add ParentStruct to StructsWithSpecialTypes. + void addStructWithSpecialType(const RecordDecl *ParentStruct) { + StructsWithSpecialTypes.insert(ParentStruct); + } + + auto &getStructsWithSpecialType() const { return StructsWithSpecialTypes; } + /// Lazily creates and returns SYCL integration header instance. SYCLIntegrationHeader &getSyclIntegrationHeader() { if (SyclIntHeader == nullptr) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index cef177d76af7b..60ed7518e903e 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1213,6 +1213,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline( PB.registerPipelineStartEPCallback( [Options](ModulePassManager &MPM, OptimizationLevel Level) { MPM.addPass(InstrProfilingLoweringPass(*Options, false)); + // The profiling pass adds SYCL device globals so we need to run + // the compile-time properties pass to update the metadata. + MPM.addPass(CompileTimePropertiesPass()); }); // TODO: Consider passing the MemoryProfileOutput to the pass builder via diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 2b6590f2ede33..5a6baf0e5906e 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5423,6 +5423,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-fsycl-is-device"); CmdArgs.push_back("-fdeclare-spirv-builtins"); + // Set the atomic profile update flag to increment counters atomically. 
+ CmdArgs.push_back("-fprofile-update=atomic"); + // Set O2 optimization level by default if (!Args.getLastArg(options::OPT_O_Group)) CmdArgs.push_back("-O2"); diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index ca9c68dedcde7..a4db63c4fb52f 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -1371,11 +1371,7 @@ static ArrayRef getUnsupportedOpts() { options::OPT_fno_profile_generate, // -f[no-]profile-generate options::OPT_ftest_coverage, options::OPT_fno_test_coverage, // -f[no-]test-coverage - options::OPT_fcoverage_mapping, - options::OPT_coverage, // --coverage - options::OPT_fno_coverage_mapping, // -f[no-]coverage-mapping - options::OPT_fprofile_instr_generate, - options::OPT_fprofile_instr_generate_EQ, + options::OPT_coverage, // --coverage options::OPT_fprofile_arcs, options::OPT_fno_profile_arcs, // -f[no-]profile-arcs options::OPT_fno_profile_instr_generate, // -f[no-]profile-instr-generate diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 639e3a2d16cc7..41182950b4aa8 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -1779,6 +1779,11 @@ class SyclKernelFieldHandler : public SyclKernelFieldHandlerBase { SemaSYCL &SemaSYCLRef; SyclKernelFieldHandler(SemaSYCL &S) : SemaSYCLRef(S) {} + // Holds the last handled kernel struct parameter that contains a special + // type. Set in the enterStruct functions. Only relevant for free function + // kernels + ParmVarDecl *ParentStruct = nullptr; + // Returns 'true' if the thing we're visiting (Based on the FD/QualType pair) // is an element of an array. FD will always be the array field. 
When // traversing the array field, Ty will be the type of the array field or the @@ -2189,31 +2194,12 @@ class SyclKernelFieldChecker : public SyclKernelFieldHandler { } bool enterStruct(const CXXRecordDecl *, ParmVarDecl *, QualType) final { - // TODO manipulate struct depth once special types are supported for free - // function kernels. - // ++StructFieldDepth; return true; } bool leaveStruct(const CXXRecordDecl *, ParmVarDecl *PD, QualType ParamTy) final { - // TODO manipulate struct depth once special types are supported for free - // function kernels. - // --StructFieldDepth; - // TODO We don't yet support special types and therefore structs that - // require decomposition and leaving/entering. Diagnose for better user - // experience. - CXXRecordDecl *RD = ParamTy->getAsCXXRecordDecl(); - if (RD->hasAttr()) { - Diag.Report(PD->getLocation(), - diag::err_bad_kernel_param_type) - << ParamTy; - Diag.Report(PD->getLocation(), - diag::note_free_function_kernel_param_type_not_supported) - << ParamTy; - IsInvalid = true; - } - return isValid(); + return true; } bool enterStruct(const CXXRecordDecl *, const CXXBaseSpecifier &, @@ -2327,8 +2313,6 @@ class SyclKernelDecompMarker : public SyclKernelFieldHandler { } bool handleSyclSpecialType(ParmVarDecl *, QualType) final { - // TODO We don't support special types in free function kernel parameters, - // but track them to diagnose the case properly. 
CollectionStack.back() = true; return true; } @@ -2598,9 +2582,8 @@ class SyclKernelPointerHandler : public SyclKernelFieldHandler { return true; } - bool enterStruct(const CXXRecordDecl *, ParmVarDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + bool enterStruct(const CXXRecordDecl *, ParmVarDecl *, + QualType ParamTy) final { return true; } @@ -2618,9 +2601,8 @@ class SyclKernelPointerHandler : public SyclKernelFieldHandler { return true; } - bool leaveStruct(const CXXRecordDecl *, ParmVarDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + bool leaveStruct(const CXXRecordDecl *, ParmVarDecl *PD, + QualType ParamTy) final { return true; } @@ -2692,9 +2674,7 @@ class SyclKernelPointerHandler : public SyclKernelFieldHandler { return true; } - bool handleScalarType(ParmVarDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + bool handleScalarType(ParmVarDecl *PD, QualType ParamTy) final { return true; } @@ -2714,10 +2694,8 @@ class SyclKernelPointerHandler : public SyclKernelFieldHandler { return true; } - bool handleNonDecompStruct(const CXXRecordDecl *, ParmVarDecl *, - QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + bool handleNonDecompStruct(const CXXRecordDecl *, ParmVarDecl *PD, + QualType ParamTy) final { return true; } @@ -3019,9 +2997,11 @@ class SyclKernelDeclCreator : public SyclKernelFieldHandler { return true; } - bool enterStruct(const CXXRecordDecl *, ParmVarDecl *, QualType) final { - // TODO - // ++StructDepth; + bool enterStruct(const CXXRecordDecl *, ParmVarDecl *PD, QualType Ty) final { + ++StructDepth; + StringRef Name = "_arg_struct"; + addParam(Name, Ty); + ParentStruct = Params.back(); return true; } @@ -3031,8 +3011,7 @@ class SyclKernelDeclCreator : public SyclKernelFieldHandler { } bool leaveStruct(const CXXRecordDecl *, ParmVarDecl *, QualType) final { - // TODO - // --StructDepth; + --StructDepth; return true; } @@ -3222,6 +3201,7 @@ class 
SyclKernelDeclCreator : public SyclKernelFieldHandler { return ArrayRef(std::begin(Params) + LastParamIndex, std::end(Params)); } + ParmVarDecl *getParentStruct() { return ParentStruct; } }; // This Visitor traverses the AST of the function with @@ -4400,16 +4380,18 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler { class FreeFunctionKernelBodyCreator : public SyclKernelFieldHandler { SyclKernelDeclCreator &DeclCreator; llvm::SmallVector BodyStmts; + // Keep track of the structs we have encountered on our way to a special type. + // They will be needed to properly generate the __init call. Note that the + // top-level struct parameter is not kept track here because that is done by + // the DeclCreator. + llvm::SmallVector CurrentStructs; FunctionDecl *FreeFunc = nullptr; SourceLocation FreeFunctionSrcLoc; // Free function source location. llvm::SmallVector ArgExprs; - // Creates a DeclRefExpr to the ParmVar that represents the current free - // function parameter. - Expr *createParamReferenceExpr() { - ParmVarDecl *FreeFunctionParameter = - DeclCreator.getParamVarDeclsForCurrentField()[0]; - + // Creates a DeclRefExpr to the ParmVar that represents an arbitrary + // free function parameter + Expr *createParamReferenceExpr(ParmVarDecl *FreeFunctionParameter) { QualType FreeFunctionParamType = FreeFunctionParameter->getOriginalType(); Expr *DRE = SemaSYCLRef.SemaRef.BuildDeclRefExpr( FreeFunctionParameter, FreeFunctionParamType, VK_LValue, @@ -4418,6 +4400,14 @@ class FreeFunctionKernelBodyCreator : public SyclKernelFieldHandler { return DRE; } + // Creates a DeclRefExpr to the ParmVar that represents the current free + // function parameter. + Expr *createParamReferenceExpr() { + ParmVarDecl *FreeFunctionParameter = + DeclCreator.getParamVarDeclsForCurrentField()[0]; + return createParamReferenceExpr(FreeFunctionParameter); + } + // Creates a DeclRefExpr to the ParmVar that represents the current pointer // parameter. 
Expr *createPointerParamReferenceExpr(QualType PointerTy) { @@ -4564,9 +4554,21 @@ class FreeFunctionKernelBodyCreator : public SyclKernelFieldHandler { DeclCreator.setBody(KernelBody); } - bool handleSyclSpecialType(FieldDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + bool handleSyclSpecialType(FieldDecl *FD, QualType FieldTy) final { + // FD represents a special type which is a field of a struct parameter + // passed to a free function kernel. Get this struct parameter using + // getParentStruct and build the __init call. Also add the struct to the + // list of special structs needed later by the integration header to + // generate some helper structs for the runtime. + Expr *Base = createParamReferenceExpr(DeclCreator.getParentStruct()); + for (const auto &child : CurrentStructs) { + Base = buildMemberExpr(Base, child); + } + MemberExpr *MemberAccess = buildMemberExpr(Base, FD); + createSpecialMethodCall(FieldTy->getAsCXXRecordDecl(), InitMethodName, + MemberAccess, BodyStmts); + SemaSYCLRef.addStructWithSpecialType( + DeclCreator.getParentStruct()->getType()->getAsCXXRecordDecl()); return true; } @@ -4575,8 +4577,8 @@ class FreeFunctionKernelBodyCreator : public SyclKernelFieldHandler { // typically if this is the case the default constructor will be private and // in such cases we must manually override the access specifier from private // to public just for the duration of this default initialization. - // TODO: Revisit this approach once https://github.com/intel/llvm/issues/16061 - // is closed. + // TODO: Revisit this approach once + // https://github.com/intel/llvm/issues/16061 is closed.
bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { // The code produced looks like this in the case of a work group memory // parameter: @@ -4669,11 +4671,7 @@ class FreeFunctionKernelBodyCreator : public SyclKernelFieldHandler { return true; } - bool handleScalarType(FieldDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); - return true; - } + bool handleScalarType(FieldDecl *FD, QualType FieldTy) final { return true; } bool handleScalarType(ParmVarDecl *, QualType) final { Expr *ParamRef = createParamReferenceExpr(); @@ -4693,27 +4691,25 @@ class FreeFunctionKernelBodyCreator : public SyclKernelFieldHandler { return true; } - bool enterStruct(const CXXRecordDecl *, FieldDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + bool enterStruct(const CXXRecordDecl *RD, FieldDecl *FD, QualType Ty) final { + CurrentStructs.push_back(FD); return true; } - bool enterStruct(const CXXRecordDecl *, ParmVarDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + bool enterStruct(const CXXRecordDecl *RD, ParmVarDecl *PD, + QualType ParamTy) final { return true; } - bool leaveStruct(const CXXRecordDecl *, FieldDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + bool leaveStruct(const CXXRecordDecl *, FieldDecl *FD, QualType Ty) final { + CurrentStructs.pop_back(); return true; } bool leaveStruct(const CXXRecordDecl *, ParmVarDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + ArgExprs.push_back(SemaSYCLRef.SemaRef.BuildDeclRefExpr( + DeclCreator.getParentStruct(), DeclCreator.getParentStruct()->getType(), + VK_PRValue, FreeFunctionSrcLoc)); return true; } @@ -4754,6 +4750,11 @@ class FreeFunctionKernelBodyCreator : public SyclKernelFieldHandler { unsupportedFreeFunctionParamType(); return true; } + FieldDecl *getCurrentStruct() { + assert(CurrentStructs.size() && + "Current free function parameter is not inside a structure!"); + return 
CurrentStructs.back(); + } }; // Kernels are only the unnamed-lambda feature if the feature is enabled, AND @@ -4796,13 +4797,9 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { addParam(ArgTy, Kind, offsetOf(FD, ArgTy)); } - // For free functions we increment the current offset as each parameter is - // added. void addParam(const ParmVarDecl *PD, QualType ParamTy, SYCLIntegrationHeader::kernel_param_kind_t Kind) { addParam(ParamTy, Kind, offsetOf(PD, ParamTy)); - CurOffset += - SemaSYCLRef.getASTContext().getTypeSizeInChars(ParamTy).getQuantity(); } void addParam(QualType ParamTy, @@ -4986,8 +4983,8 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { } bool handleSimpleArrayType(FieldDecl *FD, QualType FieldTy) final { - // Arrays are always wrapped inside of structs, so just treat it as a simple - // struct. + // Arrays are always wrapped inside of structs, so just treat it as a + // simple struct. addParam(FD, FieldTy, SYCLIntegrationHeader::kind_std_layout); return true; } @@ -5043,9 +5040,9 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { return true; } - bool enterStruct(const CXXRecordDecl *, ParmVarDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + bool enterStruct(const CXXRecordDecl *, ParmVarDecl *PD, QualType Ty) final { + addParam(PD, Ty, SYCLIntegrationHeader::kind_struct_with_special_type); + Header.setParentStruct(PD); return true; } @@ -5056,8 +5053,7 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { } bool leaveStruct(const CXXRecordDecl *, ParmVarDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + Header.setParentStruct(nullptr); return true; } @@ -6149,6 +6145,7 @@ static const char *paramKind2Str(KernelParamKind K) { CASE(work_group_memory); CASE(dynamic_work_group_memory); CASE(dynamic_accessor); + CASE(struct_with_special_type); } return ""; @@ -7194,6 +7191,10 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { 
unsigned ShimCounter = 1; int FreeFunctionCount = 0; + // Structs with special types inside need some special code generation in the + // header and we keep this visited map to not have duplicates in case several + // free function kernels use the same struct type as parameters. + llvm::DenseMap visitedStructWithSpecialType; for (const KernelDesc &K : KernelDescs) { if (!S.isFreeFunction(K.SyclKernel)) continue; @@ -7279,6 +7280,67 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { FFPrinter.printFreeFunctionDeclaration(K.SyclKernel, ParmListWithNames); } + // Now we handle all structs that contain special types + // inside. Their information is contained in StructsWithSpecialTypes of + // SemaSYCL. + for (ParmVarDecl *Param : K.SyclKernel->parameters()) { + if (!Param->getType()->isStructureType()) + continue; + const RecordDecl *Struct = Param->getType()->getAsRecordDecl(); + QualType type = Param->getType(); + if (!S.getStructsWithSpecialType().count(Struct) || + visitedStructWithSpecialType.count(Struct)) + continue; + + FwdDeclEmitter.Visit(type.getDesugaredType(S.getASTContext())); + + // This is a struct that contains a special type so it's neither a + // special type nor a trivially copyable type. We therefore need to + // explicitly communicate to the runtime that this argument should be + // allowed as a free function kernel argument. We do this by defining + // is_struct_with_special_type to be true. This helper struct also + // contains information about the offset, size and parameter + // kind of every field inside the struct at any nesting level. + // This facilitates setting the arguments in the runtime. + // We also define is_device_copyable trait to be true for this type to + // allow it being passed in device kernels.
+ O << "template <>\n"; + O << "struct " + "sycl::is_device_copyable<"; + Policy.SuppressTagKeyword = true; + type.print(O, Policy); + O << ">: std::true_type {};\n"; + + O << "template <>\n"; + O << "struct " + "sycl::ext::oneapi::experimental::detail::" + "is_struct_with_special_type<"; + Policy.SuppressTagKeyword = true; + type.print(O, Policy); + O << "> {\n"; + O << " inline static constexpr bool value = true;\n"; + O << " static constexpr int offsets[] = { "; + for (const auto OffsetSize : OffsetSizeInfo[Param]) { + O << OffsetSize.first << ", "; + } + O << "-1};\n "; + + O << " static constexpr int sizes[] = { "; + for (const auto OffsetSize : OffsetSizeInfo[Param]) { + O << OffsetSize.second << ", "; + } + O << "-1}; \n "; + + O << " static constexpr sycl::detail::kernel_param_kind_t kinds[] = {\n "; + for (const auto Kind : KindInfo[Param]) { + O << " sycl::detail::kernel_param_kind_t::" << paramKind2Str(Kind); + O << ",\n "; + } + O << "sycl::detail::kernel_param_kind_t::kind_invalid }; \n};\n\n "; + + visitedStructWithSpecialType[Struct] = true; + } + Policy.SuppressTagKeyword = false; FFPrinter.printFreeFunctionShim(K.SyclKernel, ShimCounter, ParmList); O << ";\n"; O << "}\n"; @@ -7372,6 +7434,21 @@ void SYCLIntegrationHeader::addParamDesc(kernel_param_kind_t Kind, int Info, PD.Kind = Kind; PD.Info = Info; PD.Offset = Offset; + // If we are adding a free function kernel parameter that is a struct that + // contains a special type, a little more work needs to be done in order to + // help the runtime set the kernel arguments properly. Add the offset, size, + // and Kind information to the integration header for each field inside this + // struct. Also, verify that we are actually adding a field and not the struct + // itself by checking the Kind. 
+ if (ParentStruct && + Kind != kernel_param_kind_t::kind_struct_with_special_type) { + OffsetSizeInfo[ParentStruct].emplace_back(std::make_pair(Offset, Info)); + KindInfo[ParentStruct].emplace_back(Kind); + } +} + +void SYCLIntegrationHeader::setParentStruct(ParmVarDecl *parent) { + ParentStruct = parent; } void SYCLIntegrationHeader::endKernel() { diff --git a/clang/test/CodeGenSYCL/free_function_int_header.cpp b/clang/test/CodeGenSYCL/free_function_int_header.cpp index d589c6150e2a4..b4326f023df54 100644 --- a/clang/test/CodeGenSYCL/free_function_int_header.cpp +++ b/clang/test/CodeGenSYCL/free_function_int_header.cpp @@ -278,6 +278,49 @@ void ff_24(int arg); void ff_24(int arg) { } +// Tests with parameter types that are structs that contain special types inside e.g accessor + +struct AccessorAndLocalAccessor { + sycl::accessor acc; + sycl::local_accessor lacc; +}; + +struct AccessorAndInt { + sycl::accessor acc; + int a; +}; + +struct IntAndAccessor { + int a; + sycl::accessor acc; +}; + +struct SecondLevelAccessor { + AccessorAndInt accAndInt; +}; + +template +struct TemplatedAccessorStruct { + sycl::accessor acc; + sycl::local_accessor lacc; +}; + +[[__sycl_detail__::add_ir_attributes_function("sycl-single-task-kernel", 0)]] +void ff_25(AccessorAndLocalAccessor arg1) { +} + +[[__sycl_detail__::add_ir_attributes_function("sycl-single-task-kernel", 0)]] +void ff_26(AccessorAndLocalAccessor arg1, SecondLevelAccessor arg2) { +} + +[[__sycl_detail__::add_ir_attributes_function("sycl-single-task-kernel", 0)]] +void ff_27(IntAndAccessor arg1, AccessorAndInt) { +} + +[[__sycl_detail__::add_ir_attributes_function("sycl-single-task-kernel", 0)]] +void ff_28(TemplatedAccessorStruct arg1) { +} + // CHECK: const char* const kernel_names[] = { // CHECK-NEXT: {{.*}}__sycl_kernel_ff_2Piii // CHECK-NEXT: {{.*}}__sycl_kernel_ff_2Piiii @@ -313,6 +356,11 @@ void ff_24(int arg) { // CHECK-NEXT: {{.*}}__sycl_kernel_ff_217DerivedPS_ // CHECK-NEXT: 
{{.*}}__sycl_kernel_ff_227DerivedPS_ // CHECK-NEXT: {{.*}}__sycl_kernel_ff_24i" +// CHECK-NEXT: {{.*}}__sycl_kernel_ff_2524AccessorAndLocalAccessor", +// CHECK-NEXT: {{.*}}__sycl_kernel_ff_2624AccessorAndLocalAccessor19SecondLevelAccessor", +// CHECK-NEXT: {{.*}}__sycl_kernel_ff_2714IntAndAccessor14AccessorAndInt", +// CHECK-NEXT: {{.*}}__sycl_kernel_ff_2823TemplatedAccessorStructIiE", + // CHECK-NEXT: {{.*}}__sycl_kernel_ff_23i" // CHECK-NEXT: "" @@ -321,39 +369,39 @@ void ff_24(int arg) { // CHECK: const kernel_param_desc_t kernel_signatures[] = { // CHECK-NEXT: {{.*}}__sycl_kernel_ff_2Piii // CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 8 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 12 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, // CHECK: {{.*}}__sycl_kernel_ff_2Piiii // CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 8 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 12 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 16 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, // CHECK: {{.*}}__sycl_kernel_ff_3IiEvPT_S0_S0_ // CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 8 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 12 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, // CHECK: {{.*}}__sycl_kernel_ff_3IfEvPT_S0_S0_ // CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 8 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 
12 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, // CHECK: {{.*}}__sycl_kernel_ff_3IdEvPT_S0_S0_ // CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 8, 8 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 8, 16 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 8, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 8, 0 }, // CHECK: //--- _Z18__sycl_kernel_ff_410NoPointers8Pointers3Agg // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 16, 4 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 32, 20 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 16, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 32, 0 }, // CHECK: //--- _Z18__sycl_kernel_ff_6I3Agg7DerivedEvT_T0_i // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 32, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 40, 32 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 72 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 40, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, // CHECK: //--- _Z18__sycl_kernel_ff_7ILi3EEv16KArgWithPtrArrayIXT_EE // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 48, 0 }, @@ -364,27 +412,27 @@ void ff_24(int arg) { // CHECK: //--- _ZN28__sycl_kernel_free_functions4ff_9EiPi // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 4 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, // CHECK: //--- _ZN28__sycl_kernel_free_functions5tests5ff_10EiPi // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 4 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, // CHECK: //--- 
_ZN28__sycl_kernel_free_functions5tests2V15ff_11EiPi // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 4 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, // CHECK: //--- _ZN26__sycl_kernel__GLOBAL__N_15ff_12EiPi // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 4 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, // CHECK: //--- _ZN28__sycl_kernel_free_functions5ff_13EiPi // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 4 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, // CHECK: //--- _ZN28__sycl_kernel_free_functions5tests5ff_13EiPi // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 4 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, // CHECK: //--- _Z18__sycl_kernel_ff_9N4sycl3_V125dynamic_work_group_memoryIiEE // CHECK-NEXT: { kernel_param_kind_t::kind_dynamic_work_group_memory, 8, 0 }, @@ -409,23 +457,23 @@ void ff_24(int arg) { // CHECK: //--- _ZN28__sycl_kernel_free_functions5tests5ff_14EiPi // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 4 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, // CHECK: //--- _ZN28__sycl_kernel_free_functions5ff_15EiPi // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 4 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, // CHECK: //--- _ZN28__sycl_kernel_free_functions5ff_16E3AggPS0_ // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 32, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 32 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, // CHECK: //--- _ZN28__sycl_kernel_free_functions5ff_17E7DerivedPS0_ // 
CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 40, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 40 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, // CHECK: //--- _ZN28__sycl_kernel_free_functions5tests5ff_18ENS_3AggEPS1_ // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 8, 0 }, -// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 8 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_pointer, 8, 0 }, // CHECK: //--- _Z19__sycl_kernel_ff_19N14free_functions16KArgWithPtrArrayILi50EEE // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 800, 0 }, @@ -436,6 +484,32 @@ void ff_24(int arg) { // CHECK: //--- _Z19__sycl_kernel_ff_24i // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, +// CHECK: //--- _Z19__sycl_kernel_ff_2524AccessorAndLocalAccessor +// CHECK-NEXT: { kernel_param_kind_t::kind_struct_with_special_type, 36, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_accessor, 4062, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_accessor, 4064, 12 }, + +// CHECK: //--- _Z19__sycl_kernel_ff_2624AccessorAndLocalAccessor19SecondLevelAccessor +// CHECK-NEXT: { kernel_param_kind_t::kind_struct_with_special_type, 36, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_accessor, 4062, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_accessor, 4064, 12 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_struct_with_special_type, 16, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_accessor, 4062, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 12 }, + +// CHECK: //--- _Z19__sycl_kernel_ff_2714IntAndAccessor14AccessorAndInt +// CHECK-NEXT: { kernel_param_kind_t::kind_struct_with_special_type, 16, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_accessor, 4062, 4 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_struct_with_special_type, 16, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_accessor, 4062, 0 }, +// CHECK-NEXT: { 
kernel_param_kind_t::kind_std_layout, 4, 12 }, + +// CHECK: //--- _Z19__sycl_kernel_ff_2823TemplatedAccessorStructIiE +// CHECK-NEXT: { kernel_param_kind_t::kind_struct_with_special_type, 36, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_accessor, 4062, 0 }, +// CHECK-NEXT: { kernel_param_kind_t::kind_accessor, 4064, 12 }, + // CHECK: //--- _Z19__sycl_kernel_ff_23i // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 4, 0 }, @@ -1531,18 +1605,147 @@ void ff_24(int arg) { // CHECK-NEXT: static constexpr bool value = true; // CHECK-NEXT: }; +// CHECK: Definition of _Z19__sycl_kernel_ff_2524AccessorAndLocalAccessor as a free function kernel +// CHECK: Forward declarations of kernel and its argument types: +// CHECK: void ff_25(AccessorAndLocalAccessor arg1); +// CHECK-NEXT: template <> +// CHECK-NEXT: struct sycl::is_device_copyable: std::true_type {}; +// CHECK-NEXT: template <> +// CHECK-NEXT: struct sycl::ext::oneapi::experimental::detail::is_struct_with_special_type { +// CHECK-NEXT: inline static constexpr bool value = true; +// CHECK-NEXT: static constexpr int offsets[] = { 0, 12, -1}; +// CHECK-NEXT: static constexpr int sizes[] = { 4062, 4064, -1}; +// CHECK-NEXT: static constexpr sycl::detail::kernel_param_kind_t kinds[] = { +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_accessor, +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_accessor, +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_invalid }; +// CHECK-NEXT: }; + +// CHECK: static constexpr auto __sycl_shim33() { +// CHECK-NEXT: return (void (*)(struct AccessorAndLocalAccessor))ff_25; +// CHECK-NEXT: } + +// CHECK: struct ext::oneapi::experimental::is_kernel<__sycl_shim33()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT: }; +// CHECK-NEXT: template <> +// CHECK-NEXT: struct ext::oneapi::experimental::is_single_task_kernel<__sycl_shim33()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT: }; + +// CHECK: Definition of 
_Z19__sycl_kernel_ff_2624AccessorAndLocalAccessor19SecondLevelAccessor as a free function kernel +// CHECK: Forward declarations of kernel and its argument types: +// CHECK: void ff_26(AccessorAndLocalAccessor arg1, SecondLevelAccessor arg2); +// CHECK-NEXT: template <> +// CHECK-NEXT: struct sycl::is_device_copyable: std::true_type {}; +// CHECK-NEXT: template <> +// CHECK-NEXT: struct sycl::ext::oneapi::experimental::detail::is_struct_with_special_type { +// CHECK-NEXT: inline static constexpr bool value = true; +// CHECK-NEXT: static constexpr int offsets[] = { 0, 12, -1}; +// CHECK-NEXT: static constexpr int sizes[] = { 4062, 4, -1}; +// CHECK-NEXT: static constexpr sycl::detail::kernel_param_kind_t kinds[] = { +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_accessor, +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_std_layout, +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_invalid }; +// CHECK-NEXT: }; + +// CHECK: static constexpr auto __sycl_shim34() { +// CHECK-NEXT: return (void (*)(struct AccessorAndLocalAccessor, struct SecondLevelAccessor))ff_26; +// CHECK-NEXT: } + +// CHECK: struct ext::oneapi::experimental::is_kernel<__sycl_shim34()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT: }; +// CHECK-NEXT: template <> +// CHECK-NEXT: struct ext::oneapi::experimental::is_single_task_kernel<__sycl_shim34()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT }; + +// CHECK: Definition of _Z19__sycl_kernel_ff_2714IntAndAccessor14AccessorAndInt as a free function kernel +// CHECK: Forward declarations of kernel and its argument types: +// CHECK: void ff_27(IntAndAccessor arg1, AccessorAndInt ); +// CHECK-NEXT: template <> +// CHECK-NEXT: struct sycl::is_device_copyable: std::true_type {}; +// CHECK-NEXT: template <> +// CHECK-NEXT: struct sycl::ext::oneapi::experimental::detail::is_struct_with_special_type { +// CHECK-NEXT: inline static constexpr bool value = true; +// CHECK-NEXT: static constexpr 
int offsets[] = { 0, 4, -1}; +// CHECK-NEXT: static constexpr int sizes[] = { 4, 4062, -1}; +// CHECK-NEXT: static constexpr sycl::detail::kernel_param_kind_t kinds[] = { +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_std_layout, +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_accessor, +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_invalid }; +// CHECK-NEXT: }; + +// CHECK: template <> +// CHECK-NEXT: struct sycl::is_device_copyable: std::true_type {}; +// CHECK-NEXT: template <> +// CHECK-NEXT: struct sycl::ext::oneapi::experimental::detail::is_struct_with_special_type { +// CHECK-NEXT: inline static constexpr bool value = true; +// CHECK-NEXT: static constexpr int offsets[] = { 0, 12, -1}; +// CHECK-NEXT: static constexpr int sizes[] = { 4062, 4, -1}; +// CHECK-NEXT: static constexpr sycl::detail::kernel_param_kind_t kinds[] = { +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_accessor, +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_std_layout, +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_invalid }; +// CHECK-NEXT: }; + + + +// CHECK: static constexpr auto __sycl_shim35() { +// CHECK-NEXT: return (void (*)(struct IntAndAccessor, struct AccessorAndInt))ff_27; +// CHECK-NEXT: } + +// CHECK: struct ext::oneapi::experimental::is_kernel<__sycl_shim35()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT: }; +// CHECK-NEXT: template <> +// CHECK-NEXT: struct ext::oneapi::experimental::is_single_task_kernel<__sycl_shim35()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT: }; + + +// CHECK: Definition of _Z19__sycl_kernel_ff_2823TemplatedAccessorStructIiE as a free function kernel +// CHECK: Forward declarations of kernel and its argument types: +// CHECK: template struct TemplatedAccessorStruct; +// CHECK: void ff_28(TemplatedAccessorStruct arg1); +// CHECK-NEXT: template <> +// CHECK-NEXT: struct sycl::is_device_copyable>: std::true_type {}; +// CHECK-NEXT: template <> +// 
CHECK-NEXT: struct sycl::ext::oneapi::experimental::detail::is_struct_with_special_type> { +// CHECK-NEXT: inline static constexpr bool value = true; +// CHECK-NEXT: static constexpr int offsets[] = { 0, 12, -1}; +// CHECK-NEXT: static constexpr int sizes[] = { 4062, 4064, -1}; +// CHECK-NEXT: static constexpr sycl::detail::kernel_param_kind_t kinds[] = { +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_accessor, +// CHECK-NEXT: sycl::detail::kernel_param_kind_t::kind_accessor, +// CHECK-NEXT sycl::detail::kernel_param_kind_t::kind_invalid }; +// CHECK-NEXT: }; + +// CHECK: static constexpr auto __sycl_shim36() { +// CHECK-NEXT: return (void (*)(struct TemplatedAccessorStruct))ff_28; +// CHECK-NEXT: } + +// CHECK: struct ext::oneapi::experimental::is_kernel<__sycl_shim36()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT: }; +// CHECK-NEXT: template <> +// CHECK-NEXT: struct ext::oneapi::experimental::is_single_task_kernel<__sycl_shim36()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT: }; + // CHECK: Definition of _Z19__sycl_kernel_ff_23i as a free function kernel // CHECK: Forward declarations of kernel and its argument types: // CHECK: void ff_23(int arg); -// CHECK-NEXT: static constexpr auto __sycl_shim33() { +// CHECK-NEXT: static constexpr auto __sycl_shim37() { // CHECK-NEXT: return (void (*)(int))ff_23; // CHECK-NEXT: } // CHECK: namespace sycl { // CHECK-NEXT: inline namespace _V1 { // CHECK-NEXT: namespace detail { -// CHECK-NEXT: //Free Function Kernel info specialization for shim33 -// CHECK-NEXT: template <> struct FreeFunctionInfoData<__sycl_shim33()> { +// CHECK-NEXT: //Free Function Kernel info specialization for shim37 +// CHECK-NEXT: template <> struct FreeFunctionInfoData<__sycl_shim37()> { // CHECK-NEXT: __SYCL_DLL_LOCAL // CHECK-NEXT: static constexpr unsigned getNumParams() { return 1; } // CHECK-NEXT: __SYCL_DLL_LOCAL @@ -1554,11 +1757,11 @@ void ff_24(int arg) { // CHECK: namespace sycl { // 
CHECK-NEXT: template <> -// CHECK-NEXT: struct ext::oneapi::experimental::is_kernel<__sycl_shim33()> { +// CHECK-NEXT: struct ext::oneapi::experimental::is_kernel<__sycl_shim37()> { // CHECK-NEXT: static constexpr bool value = true; // CHECK-NEXT: }; // CHECK-NEXT: template <> -// CHECK-NEXT: struct ext::oneapi::experimental::is_single_task_kernel<__sycl_shim33()> { +// CHECK-NEXT: struct ext::oneapi::experimental::is_single_task_kernel<__sycl_shim37()> { // CHECK-NEXT: static constexpr bool value = true; // CHECK-NEXT: }; @@ -1570,10 +1773,10 @@ void ff_24(int arg) { // CHECK-NEXT: namespace { // CHECK-NEXT: struct GlobalMapUpdater { // CHECK-NEXT: GlobalMapUpdater() { -// CHECK-NEXT: sycl::detail::free_function_info_map::add(sycl::detail::kernel_names, sycl::detail::kernel_args_sizes, 33); +// CHECK-NEXT: sycl::detail::free_function_info_map::add(sycl::detail::kernel_names, sycl::detail::kernel_args_sizes, 37); // CHECK-NEXT: } // CHECK-NEXT: ~GlobalMapUpdater() { -// CHECK-NEXT: sycl::detail::free_function_info_map::remove(sycl::detail::kernel_names, sycl::detail::kernel_args_sizes, 33); +// CHECK-NEXT: sycl::detail::free_function_info_map::remove(sycl::detail::kernel_names, sycl::detail::kernel_args_sizes, 37); // CHECK-NEXT: } // CHECK-NEXT: }; // CHECK-NEXT: static GlobalMapUpdater updater; diff --git a/clang/test/Driver/sycl-bc-device-libraries.cpp b/clang/test/Driver/sycl-bc-device-libraries.cpp index 34c926dff69da..b3c4744ad1c8f 100644 --- a/clang/test/Driver/sycl-bc-device-libraries.cpp +++ b/clang/test/Driver/sycl-bc-device-libraries.cpp @@ -1,27 +1,31 @@ /// Test that SYCL bitcode device libraries are properly separated for NVIDIA and AMD targets. -/// Check devicelib and libspirv are linked for nvptx. +/// Check devicelib are linked for nvptx. 
// RUN: %clang -### -fsycl --offload-new-driver \ +// RUN: -fno-sycl-libspirv -Wno-unsafe-libspirv-not-linked \ // RUN: -fsycl-targets=nvptx64-nvidia-cuda \ // RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ // RUN: %s 2>&1 | FileCheck -check-prefix=CHECK-NVPTX-BC %s // RUN: %clang_cl -### -fsycl --offload-new-driver \ +// RUN: -fno-sycl-libspirv -Wno-unsafe-libspirv-not-linked \ // RUN: -fsycl-targets=nvptx64-nvidia-cuda \ // RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ // RUN: %s 2>&1 | FileCheck -check-prefix=CHECK-NVPTX-BC %s // CHECK-NVPTX-BC: clang-linker-wrapper -// CHECK-NVPTX-BC-SAME: "--bitcode-library=nvptx64-nvidia-cuda={{.*}}devicelib-nvptx64-nvidia-cuda.bc" "--bitcode-library=nvptx64-nvidia-cuda={{.*}}libspirv-nvptx64-nvidia-cuda.bc" +// CHECK-NVPTX-BC-SAME: "--bitcode-library=nvptx64-nvidia-cuda={{.*}}devicelib-nvptx64-nvidia-cuda.bc" /// Check devicelib is linked for amdgcn. // RUN: %clang -### -fsycl --offload-new-driver \ +// RUN: -fno-sycl-libspirv -Wno-unsafe-libspirv-not-linked \ // RUN: -fsycl-targets=amdgcn-amd-amdhsa \ // RUN: -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx900 \ // RUN: --rocm-path=%S/Inputs/rocm \ // RUN: %s 2>&1 | FileCheck -check-prefix=CHECK-AMD-BC %s // RUN: %clang_cl -### -fsycl --offload-new-driver \ +// RUN: -fno-sycl-libspirv -Wno-unsafe-libspirv-not-linked \ // RUN: -fsycl-targets=amdgcn-amd-amdhsa \ // RUN: -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx900 \ // RUN: --rocm-path=%S/Inputs/rocm \ @@ -32,6 +36,7 @@ /// Check linking with multiple targets. 
// RUN: %clang -### -fsycl --offload-new-driver \ +// RUN: -fno-sycl-libspirv -Wno-unsafe-libspirv-not-linked \ // RUN: -fsycl-targets=amdgcn-amd-amdhsa,nvptx64-nvidia-cuda \ // RUN: -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx900 \ // RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ @@ -39,6 +44,7 @@ // RUN: %s 2>&1 | FileCheck -check-prefix=CHECK-MULTI-TARGET %s // RUN: %clang_cl -### -fsycl --offload-new-driver \ +// RUN: -fno-sycl-libspirv -Wno-unsafe-libspirv-not-linked \ // RUN: -fsycl-targets=amdgcn-amd-amdhsa,nvptx64-nvidia-cuda \ // RUN: -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx900 \ // RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ @@ -46,21 +52,26 @@ // RUN: %s 2>&1 | FileCheck -check-prefix=CHECK-MULTI-TARGET %s // CHECK-MULTI-TARGET: clang-linker-wrapper -// CHECK-MULTI-TARGET-SAME: "--bitcode-library=amdgcn-amd-amdhsa={{.*}}devicelib-amdgcn-amd-amdhsa.bc" "--bitcode-library=nvptx64-nvidia-cuda={{.*}}devicelib-nvptx64-nvidia-cuda.bc" "--bitcode-library=nvptx64-nvidia-cuda={{.*}}libspirv-nvptx64-nvidia-cuda.bc" +// CHECK-MULTI-TARGET-SAME: "--bitcode-library=amdgcn-amd-amdhsa={{.*}}devicelib-amdgcn-amd-amdhsa.bc" "--bitcode-library=nvptx64-nvidia-cuda={{.*}}devicelib-nvptx64-nvidia-cuda.bc" /// Test --bitcode-library with nvptx dummy libraries. 
// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -o %t.nvptx.devicelib.bc -// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -o %t.nvptx.libspirv.bc -// RUN: %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda --offload-new-driver -c %s -o %t.nvptx.o -nocudalib -// RUN: clang-linker-wrapper --bitcode-library=nvptx64-nvidia-cuda=%t.nvptx.devicelib.bc --bitcode-library=nvptx64-nvidia-cuda=%t.nvptx.libspirv.bc \ +// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -o %t.nvptx.libdummy.bc +// RUN: %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda \ +// RUN: -fno-sycl-libspirv -Wno-unsafe-libspirv-not-linked \ +// RUN: --offload-new-driver -c %s -o %t.nvptx.o -nocudalib +// RUN: clang-linker-wrapper --bitcode-library=nvptx64-nvidia-cuda=%t.nvptx.devicelib.bc --bitcode-library=nvptx64-nvidia-cuda=%t.nvptx.libdummy.bc \ // RUN: --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t.nvptx.o -o a.out 2>&1 | FileCheck -check-prefix=CHECK-WRAPPER-NVPTX %s -// CHECK-WRAPPER-NVPTX: llvm-link{{.*}} {{.*}}.nvptx.devicelib.bc {{.*}}.nvptx.libspirv.bc +// CHECK-WRAPPER-NVPTX: llvm-link{{.*}} {{.*}}.nvptx.devicelib.bc {{.*}}.nvptx.libdummy.bc /// Test --bitcode-library with amdgcn dummy library. 
// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -o %t.amd.devicelib.bc -// RUN: %clangxx -fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx900 --offload-new-driver -c %s -o %t.amd.o -nogpulib +// RUN: %clangxx -fsycl -fsycl-targets=amdgcn-amd-amdhsa \ +// RUN: -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx900 \ +// RUN: -fno-sycl-libspirv -Wno-unsafe-libspirv-not-linked \ +// RUN: --offload-new-driver -c %s -o %t.amd.o -nogpulib -fgpu-rdc // RUN: clang-linker-wrapper --bitcode-library=amdgcn-amd-amdhsa=%t.amd.devicelib.bc \ // RUN: --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t.amd.o -o a.out 2>&1 | FileCheck -check-prefix=CHECK-WRAPPER-AMD %s @@ -70,10 +81,11 @@ /// Test --bitcode-library with multi-target bc libraries. // RUN: %clangxx -fsycl -fsycl-targets=amdgcn-amd-amdhsa,nvptx64-nvidia-cuda \ // RUN: -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx900 \ -// RUN: --offload-new-driver -c %s -o %t.multi.o -nocudalib -nogpulib -// RUN: clang-linker-wrapper --bitcode-library=amdgcn-amd-amdhsa=%t.amd.devicelib.bc --bitcode-library=nvptx64-nvidia-cuda=%t.nvptx.devicelib.bc --bitcode-library=nvptx64-nvidia-cuda=%t.nvptx.libspirv.bc \ +// RUN: -fno-sycl-libspirv -Wno-unsafe-libspirv-not-linked \ +// RUN: --offload-new-driver -c %s -o %t.multi.o -nocudalib -nogpulib -fgpu-rdc +// RUN: clang-linker-wrapper --bitcode-library=amdgcn-amd-amdhsa=%t.amd.devicelib.bc --bitcode-library=nvptx64-nvidia-cuda=%t.nvptx.devicelib.bc --bitcode-library=nvptx64-nvidia-cuda=%t.nvptx.libdummy.bc \ // RUN: --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t.multi.o -o a.out 2>&1 | FileCheck -check-prefix=CHECK-WRAPPER-MULTI %s // CHECK-WRAPPER-MULTI: llvm-link{{.*}} {{.*}}.amd.devicelib.bc -// CHECK-WRAPPER-MULTI: llvm-link{{.*}} {{.*}}.nvptx.devicelib.bc {{.*}}.nvptx.libspirv.bc +// CHECK-WRAPPER-MULTI: llvm-link{{.*}} 
{{.*}}.nvptx.devicelib.bc {{.*}}.nvptx.libdummy.bc diff --git a/clang/test/Driver/sycl-profile-update.cpp b/clang/test/Driver/sycl-profile-update.cpp new file mode 100644 index 0000000000000..2d11cc6378047 --- /dev/null +++ b/clang/test/Driver/sycl-profile-update.cpp @@ -0,0 +1,4 @@ +// Ensure that the profile update mode is set to 'atomic' when compiling SYCL code. +// RUN: %clangxx -### -fsycl -fprofile-instr-generate -fcoverage-mapping %s 2>&1 | FileCheck %s +// RUN: %clang_cl -### -fsycl -fprofile-instr-generate -fcoverage-mapping %s 2>&1 | FileCheck %s +// CHECK: "-fprofile-update=atomic" diff --git a/clang/test/Driver/sycl-unsupported.cpp b/clang/test/Driver/sycl-unsupported.cpp index a2c09f615209a..044c1805307bf 100644 --- a/clang/test/Driver/sycl-unsupported.cpp +++ b/clang/test/Driver/sycl-unsupported.cpp @@ -19,13 +19,6 @@ // RUN: -DOPT_CC1=-debug-info-kind=line-tables-only \ // RUN: -check-prefixes=UNSUPPORTED_OPT_DIAG,UNSUPPORTED_OPT -// RUN: %clangxx -fsycl -fprofile-instr-generate -### %s 2>&1 \ -// RUN: | FileCheck %s -DARCH=spir64 -DOPT=-fprofile-instr-generate \ -// RUN: -DOPT_CC1=-fprofile-instrument=clang \ -// RUN: -check-prefixes=UNSUPPORTED_OPT_DIAG,UNSUPPORTED_OPT -// RUN: %clangxx -fsycl -fcoverage-mapping \ -// RUN: -fprofile-instr-generate -### %s 2>&1 \ -// RUN: | FileCheck %s -DARCH=spir64 -DOPT=-fcoverage-mapping // RUN: %clangxx -fsycl -ftest-coverage -### %s 2>&1 \ // RUN: | FileCheck %s -DARCH=spir64 -DOPT=-ftest-coverage \ // RUN: -DOPT_CC1=-coverage-notes-file \ @@ -49,12 +42,6 @@ // RUN: | FileCheck %s -DARCH=spir64 -DOPT=--coverage \ // RUN: -DOPT_CC1=-coverage-notes-file \ // RUN: -check-prefixes=UNSUPPORTED_OPT_DIAG,UNSUPPORTED_OPT -// Check to make sure our '-fsanitize=address' exception isn't triggered by a -// different option -// RUN: %clangxx -fsycl -fprofile-instr-generate=address -### %s 2>&1 \ -// RUN: | FileCheck %s -DARCH=spir64 -DOPT=-fprofile-instr-generate=address \ -// RUN: -DOPT_CC1=-fprofile-instrument=clang \ 
-// RUN: -check-prefixes=UNSUPPORTED_OPT_DIAG,UNSUPPORTED_OPT // CHECK: ignoring '[[OPT]]' option as it is not currently supported for target '[[ARCH]]{{.*}}'; only supported for host compilation [-Woption-ignored] // CHECK-NOT: clang{{.*}} "-fsycl-is-device"{{.*}} "[[OPT]]{{.*}}" diff --git a/clang/test/SemaSYCL/free_function_kernel_params_restrictions.cpp b/clang/test/SemaSYCL/free_function_kernel_params_restrictions.cpp index d1bdc0e3da475..c7b2d2de8921c 100644 --- a/clang/test/SemaSYCL/free_function_kernel_params_restrictions.cpp +++ b/clang/test/SemaSYCL/free_function_kernel_params_restrictions.cpp @@ -42,20 +42,3 @@ __attribute__((sycl_device)) [[__sycl_detail__::add_ir_attributes_function("sycl-single-task-kernel", 0)]] void ff_5(A S1) { } - - - -struct StructWithAccessor { - sycl::accessor acc; - int *ptr; -}; - -struct Wrapper { - StructWithAccessor SWA; - -}; - -[[__sycl_detail__::add_ir_attributes_function("sycl-single-task-kernel", 0)]] -void ff_6(Wrapper S1) { // expected-error {{cannot be used as the type of a kernel parameter}} - // expected-note@-1 {{'Wrapper' is not yet supported as a free function kernel parameter}} -} diff --git a/compiler-rt/lib/profile/InstrProfilingRuntime.cpp b/compiler-rt/lib/profile/InstrProfilingRuntime.cpp index 6b2ce97001735..ed1f277c96641 100644 --- a/compiler-rt/lib/profile/InstrProfilingRuntime.cpp +++ b/compiler-rt/lib/profile/InstrProfilingRuntime.cpp @@ -10,6 +10,22 @@ extern "C" { #include "InstrProfiling.h" +void __sycl_increment_profile_counters(uint64_t FnHash, size_t NumCounters, + const uint64_t *Increments) { + for (const __llvm_profile_data *DataVar = __llvm_profile_begin_data(); + DataVar < __llvm_profile_end_data(); DataVar++) { + if (DataVar->NameRef != FnHash || DataVar->NumCounters != NumCounters) + continue; + + uint64_t *const Counters = reinterpret_cast( + reinterpret_cast(DataVar) + + reinterpret_cast(DataVar->CounterPtr)); + for (size_t i = 0; i < NumCounters; i++) + Counters[i] += Increments[i]; 
+ break; + } +} + static int RegisterRuntime() { __llvm_profile_initialize(); #ifdef _AIX diff --git a/devops/actions/run-tests/cts/action.yml b/devops/actions/run-tests/cts/action.yml index 67c27bfd11da5..6472fcca9edcc 100644 --- a/devops/actions/run-tests/cts/action.yml +++ b/devops/actions/run-tests/cts/action.yml @@ -95,7 +95,7 @@ runs: - name: SYCL CTS List devices # Proceed with execution even if the 'build' step did not succeed. - if: ${{ !cancelled()) && inputs.testing_mode != 'build-only' }} + if: ${{ !cancelled() && inputs.testing_mode != 'build-only' }} shell: bash env: ONEAPI_DEVICE_SELECTOR: ${{ inputs.target_devices }} @@ -129,7 +129,7 @@ runs: - name: Run SYCL CTS tests # Proceed with execution even if the previous two steps did not succeed. - if: ${{ !cancelled()) && inputs.testing_mode != 'build-only' }} + if: ${{ !cancelled() && inputs.testing_mode != 'build-only' }} env: ONEAPI_DEVICE_SELECTOR: ${{ inputs.target_devices }} # By-default GitHub actions execute the "run" shell script with -e option, diff --git a/devops/actions/run-tests/e2e/action.yml b/devops/actions/run-tests/e2e/action.yml index c78935eab8eb5..f277e02518359 100644 --- a/devops/actions/run-tests/e2e/action.yml +++ b/devops/actions/run-tests/e2e/action.yml @@ -106,3 +106,8 @@ runs: name: ${{ inputs.binaries_artifact }} path: e2e_binaries.tar.zst retention-days: ${{ inputs.retention-days }} + - name: Cleanup E2E tests + if: ${{ !cancelled() }} + shell: bash + run: | + rm -rf build-e2e diff --git a/devops/actions/run-tests/windows/cts/action.yml b/devops/actions/run-tests/windows/cts/action.yml index ac605bd4bacd9..83d753c4ae161 100644 --- a/devops/actions/run-tests/windows/cts/action.yml +++ b/devops/actions/run-tests/windows/cts/action.yml @@ -97,7 +97,7 @@ runs: - name: SYCL CTS List devices # Proceed with execution even if the 'build' step did not succeed. 
- if: ${{ !cancelled()) && inputs.testing_mode != 'build-only' }} + if: ${{ !cancelled() && inputs.testing_mode != 'build-only' }} shell: bash env: ONEAPI_DEVICE_SELECTOR: ${{ inputs.target_devices }} @@ -131,7 +131,7 @@ runs: - name: Run SYCL CTS tests # Proceed with execution even if the previous two steps did not succeed. - if: ${{ !cancelled()) && inputs.testing_mode != 'build-only' }} + if: ${{ !cancelled() && inputs.testing_mode != 'build-only' }} env: ONEAPI_DEVICE_SELECTOR: ${{ inputs.target_devices }} # By-default GitHub actions execute the "run" shell script with -e option, diff --git a/devops/actions/run-tests/windows/e2e/action.yml b/devops/actions/run-tests/windows/e2e/action.yml index 5400db21a7cf3..0a3ac8d92df04 100644 --- a/devops/actions/run-tests/windows/e2e/action.yml +++ b/devops/actions/run-tests/windows/e2e/action.yml @@ -121,3 +121,7 @@ runs: name: ${{ inputs.binaries_artifact }} path: e2e_bin.tar.gz retention-days: ${{ inputs.retention-days }} + - name: Cleanup E2E tests + if: ${{ !cancelled() }} + shell: bash + run: rm -rf build-e2e diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py index 9d8523dd1ad16..02e52f01e3266 100644 --- a/devops/scripts/benchmarks/benches/compute.py +++ b/devops/scripts/benchmarks/benches/compute.py @@ -269,6 +269,7 @@ def benchmarks(self) -> list[Benchmark]: ) ) + # Add RecordAndReplay benchmarks record_and_replay_params = product([0, 1], [0, 1]) for emulate, instantiate in record_and_replay_params: @@ -315,6 +316,39 @@ def createRrBench(variant_name: str, **kwargs): ), ] + # Add TorchMultiQueue benchmarks + for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES): + + def createTorchMultiQueueBench(variant_name: str, **kwargs): + return TorchMultiQueue( + self, + runtime, + variant_name, + PROFILERS.TIMER, + **kwargs, + ) + + benches += [ + createTorchMultiQueueBench( + "large", + workgroupCount=4096, + workgroupSize=512, + kernelsPerQueue=20, + ), + 
createTorchMultiQueueBench( + "medium", + workgroupCount=512, + workgroupSize=256, + kernelsPerQueue=10, + ), + createTorchMultiQueueBench( + "small", + workgroupCount=256, + workgroupSize=124, + kernelsPerQueue=4, + ), + ] + # Add UR-specific benchmarks benches += [ # TODO: multithread_benchmark_ur fails with segfault @@ -770,6 +804,48 @@ def _bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: return [f"--{k}={v}" for k, v in self._rr_params.items()] +class TorchMultiQueue(ComputeBenchmark): + def __init__( + self, suite, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs + ): + self._variant_name = variant_name + self._smq_params = kwargs + self._iterations_regular = 1000 + self._iterations_trace = 10 + super().__init__( + suite, + f"torch_benchmark_{runtime.value}", + "KernelSubmitMultiQueue", + runtime, + profiler_type, + ) + + def name(self): + ret = [] + for k, v in self._smq_params.items(): + ret.append(f"{k} {v}") + ret.sort() + return self._bench_name + " " + ", ".join(ret) + + def display_name(self) -> str: + return f"{self.explicit_group()} {self._runtime.value}" + + def explicit_group(self): + return f"{self._test} {self._variant_name}" + + def get_tags(self): + return ["pytorch", runtime_to_tag_name(self._runtime)] + + def _supported_runtimes(self) -> list[RUNTIMES]: + return super()._supported_runtimes() + [RUNTIMES.SYCL_PREVIEW] + + def _bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: + iters = self._get_iters(run_trace) + return [f"--iterations={iters}"] + [ + f"--{k}={v}" for k, v in self._smq_params.items() + ] + + class QueueInOrderMemcpy(ComputeBenchmark): def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type): self._is_copy_only = isCopyOnly diff --git a/devops/scripts/benchmarks/tests/test_integration.py b/devops/scripts/benchmarks/tests/test_integration.py index fc6c4482463dd..6b546f80cb3a4 100644 --- a/devops/scripts/benchmarks/tests/test_integration.py +++ 
b/devops/scripts/benchmarks/tests/test_integration.py @@ -188,6 +188,26 @@ def test_submit_kernel(self): {"L0", "latency", "micro", "submit"}, ) + def test_torch_l0(self): + self._checkCase( + "torch_benchmark_l0 kernelsPerQueue 20, workgroupCount 4096, workgroupSize 512", + "KernelSubmitMultiQueue large", + {"pytorch", "L0"}, + ) + + def test_torch_sycl(self): + self._checkCase( + "torch_benchmark_sycl kernelsPerQueue 10, workgroupCount 512, workgroupSize 256", + "KernelSubmitMultiQueue medium", + {"pytorch", "SYCL"}, + ) + + def test_torch_syclpreview(self): + self._checkCase( + "torch_benchmark_syclpreview kernelsPerQueue 4, workgroupCount 256, workgroupSize 124", + "KernelSubmitMultiQueue small", + {"pytorch", "SYCL"}, + ) if __name__ == "__main__": unittest.main() diff --git a/lldb/tools/lldb-dap/package-lock.json b/lldb/tools/lldb-dap/package-lock.json index a9ee377615a2f..fb0cb76408d82 100644 --- a/lldb/tools/lldb-dap/package-lock.json +++ b/lldb/tools/lldb-dap/package-lock.json @@ -2183,48 +2183,48 @@ } }, "node_modules/jsonwebtoken/node_modules/jwa": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/jwa/-/jwa-1.4.1.tgz", - "integrity": "sha512-qiLX/xhEEFKUAJ6FiBMbes3w9ATzyk5W7Hvzpa/SLYdxNtng+gcurvrI7TbACjIXlsJyr05/S1oUhZrc63evQA==", + "version": "1.4.2", + "resolved": "https://registry.npmjs.org/jwa/-/jwa-1.4.2.tgz", + "integrity": "sha512-eeH5JO+21J78qMvTIDdBXidBd6nG2kZjg5Ohz/1fpa28Z4CcsWUzJ1ZZyFq/3z3N17aZy+ZuBoHljASbL1WfOw==", "dev": true, "license": "MIT", "dependencies": { - "buffer-equal-constant-time": "1.0.1", + "buffer-equal-constant-time": "^1.0.1", "ecdsa-sig-formatter": "1.0.11", "safe-buffer": "^5.0.1" } }, "node_modules/jsonwebtoken/node_modules/jws": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/jws/-/jws-3.2.2.tgz", - "integrity": "sha512-YHlZCB6lMTllWDtSPHz/ZXTsi8S00usEV6v1tjq8tOUZzw7DpSDWVXjXDre6ed1w/pd495ODpHZYSdkRTsa0HA==", + "version": "3.2.3", + "resolved": 
"https://registry.npmjs.org/jws/-/jws-3.2.3.tgz", + "integrity": "sha512-byiJ0FLRdLdSVSReO/U4E7RoEyOCKnEnEPMjq3HxWtvzLsV08/i5RQKsFVNkCldrCaPr2vDNAOMsfs8T/Hze7g==", "dev": true, "license": "MIT", "dependencies": { - "jwa": "^1.4.1", + "jwa": "^1.4.2", "safe-buffer": "^5.0.1" } }, "node_modules/jwa": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.0.tgz", - "integrity": "sha512-jrZ2Qx916EA+fq9cEAeCROWPTfCwi1IVHqT2tapuqLEVVDKFDENFw1oL+MwrTvH6msKxsd1YTDVw6uKEcsrLEA==", + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz", + "integrity": "sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==", "dev": true, "license": "MIT", "dependencies": { - "buffer-equal-constant-time": "1.0.1", + "buffer-equal-constant-time": "^1.0.1", "ecdsa-sig-formatter": "1.0.11", "safe-buffer": "^5.0.1" } }, "node_modules/jws": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.0.tgz", - "integrity": "sha512-KDncfTmOZoOMTFG4mBlG0qUIOlc03fmzH+ru6RgYVZhPkyiy/92Owlt/8UEN+a4TXR1FQetfIpJE8ApdvdVxTg==", + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.1.tgz", + "integrity": "sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==", "dev": true, "license": "MIT", "dependencies": { - "jwa": "^2.0.0", + "jwa": "^2.0.1", "safe-buffer": "^5.0.1" } }, @@ -2669,6 +2669,7 @@ "integrity": "sha512-e9MewbtFo+Fevyuxn/4rrcDAaq0IYxPGLvObpQjiZBMAzB9IGmzlnG9RZy3FFas+eBMu2vA0CszMeduow5dIuQ==", "dev": true, "license": "MIT", + "peer": true, "bin": { "prettier": "bin/prettier.cjs" }, diff --git a/llvm/lib/LTO/CMakeLists.txt b/llvm/lib/LTO/CMakeLists.txt index 9470a69cb00ac..c1f7e058cdae4 100644 --- a/llvm/lib/LTO/CMakeLists.txt +++ b/llvm/lib/LTO/CMakeLists.txt @@ -1,3 +1,10 @@ +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # We append -fno-lifetime-dse in HandleLLVMOptions.cmake + # append("-fno-lifetime-dse" 
CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + # But it is causing link failure with llvm::StdThreadPool::asyncEnqueue + string(REPLACE "-fno-lifetime-dse" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +endif () + add_llvm_component_library(LLVMLTO LTO.cpp LTOBackend.cpp diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 8c8d16a6e3d25..a99c53df1f308 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -994,6 +994,9 @@ bool InstrLowerer::lower() { if (!NeedsRuntimeHook && ContainsProfiling) emitRuntimeHook(); + if (M.getTargetTriple().isSPIR()) + return true; + emitRegistration(); emitUses(); emitInitialization(); @@ -1108,6 +1111,18 @@ GlobalVariable *InstrLowerer::getOrCreateBiasVar(StringRef VarName) { } Value *InstrLowerer::getCounterAddress(InstrProfCntrInstBase *I) { + if (M.getTargetTriple().isSPIR()) { + auto *Counters = getOrCreateRegionCounters(I); + IRBuilder<> Builder(I); + auto *Addr = Builder.CreateLoad(PointerType::get(M.getContext(), 1), + Counters, "pgocount.addr"); + const std::uint64_t Index = I->getIndex()->getZExtValue(); + if (Index == 0) + return Addr; + auto *Offset = Builder.getInt64(Index * sizeof(std::uint64_t)); + return Builder.CreatePtrAdd(Addr, Offset, "pgocount.offset"); + } + auto *Counters = getOrCreateRegionCounters(I); IRBuilder<> Builder(I); @@ -1648,6 +1663,22 @@ InstrLowerer::getOrCreateRegionBitmaps(InstrProfMCDCBitmapInstBase *Inc) { GlobalVariable * InstrLowerer::createRegionCounters(InstrProfCntrInstBase *Inc, StringRef Name, GlobalValue::LinkageTypes Linkage) { + if (M.getTargetTriple().isSPIR()) { + uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); + auto &Ctx = M.getContext(); + auto *PtrTy = PointerType::get(Ctx, 1); + auto *IntTy = Type::getInt64Ty(Ctx); + auto *StructTy = StructType::get(Ctx, {PtrTy, IntTy}); + GlobalVariable *GV = new GlobalVariable( + M, StructTy, 
false, Linkage, Constant::getNullValue(StructTy), Name); + const std::uint64_t FnHash = IndexedInstrProf::ComputeHash( + getPGOFuncNameVarInitializer(Inc->getName())); + const std::string FnName = std::string{"__profc_"} + std::to_string(FnHash); + GV->addAttribute("sycl-unique-id", FnName); + GV->addAttribute("sycl-device-global-size", Twine(NumCounters * 8).str()); + return GV; + } + uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); auto &Ctx = M.getContext(); GlobalVariable *GV; diff --git a/llvm/test/Instrumentation/InstrProfiling/coverage_sycl.ll b/llvm/test/Instrumentation/InstrProfiling/coverage_sycl.ll new file mode 100644 index 0000000000000..e2e5688432e0e --- /dev/null +++ b/llvm/test/Instrumentation/InstrProfiling/coverage_sycl.ll @@ -0,0 +1,29 @@ +; RUN: opt < %s -passes=instrprof -S | FileCheck %s + +target triple = "spir64-unknown-unknown" + +@__profn_foo = private constant [3 x i8] c"foo" +; CHECK: @__profc_foo = private global { ptr addrspace(1), i64 } zeroinitializer, section "__llvm_prf_cnts", comdat #0 +; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64) +@__profn_bar = private constant [3 x i8] c"bar" +; CHECK: @__profc_bar = private global { ptr addrspace(1), i64 } zeroinitializer, section "__llvm_prf_cnts", comdat #1 +; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64) + +; CHECK: @__llvm_prf_nm = {{.*}} section "__llvm_prf_names" + +define void @_Z3foov() { + call void @llvm.instrprof.cover(ptr @__profn_foo, i64 12345678, i32 1, i32 0) + ; CHECK: %pgocount.addr = load ptr addrspace(1), ptr @__profc_foo, align 8 + ; CHECK: store i8 0, ptr addrspace(1) %pgocount.addr, align 1 + ret void +} + +%class.A = type { ptr } +define dso_local void @_Z3barv(ptr nocapture nonnull align 8 %0) unnamed_addr #0 align 2 { 
+ call void @llvm.instrprof.cover(ptr @__profn_bar, i64 87654321, i32 1, i32 0) + ; CHECK: %pgocount.addr = load ptr addrspace(1), ptr @__profc_bar, align 8 + ; CHECK: store i8 0, ptr addrspace(1) %pgocount.addr, align 1 + ret void +} + +declare void @llvm.instrprof.cover(ptr, i64, i32, i32) diff --git a/sycl/doc/design/DeviceCodeCoverage.md b/sycl/doc/design/DeviceCodeCoverage.md new file mode 100644 index 0000000000000..623023e703ac5 --- /dev/null +++ b/sycl/doc/design/DeviceCodeCoverage.md @@ -0,0 +1,71 @@ +# Design for Device-side Code Coverage + +## Overview + +This document describes the design and implementation of device-side code coverage for SYCL, extending Clang's source-based code coverage to support device code. The approach leverages the existing SYCL device global infrastructure, as detailed in the [DeviceGlobal.md](DeviceGlobal.md) design document, to enable collection and aggregation of coverage data from device kernels. + +## Design Details + +### Profiling Counter Representation + +Profiling counters for code coverage are lowered by the compiler as device globals. Specifically, the `InstrProfilingLoweringPass` is modified so that, when targeting SPIR-V, coverage counters are represented as pointers to USM buffers, matching the representation of other SYCL device globals. This indirection allows counters to be relocatable and managed consistently with other device-side global variables. + +Each counter is annotated with a unique identifier (`sycl-unique-id`) of the form `__profc_`, where `` is a 64-bit unsigned integer uniquely identifying the instrumented function. The counter's size is also recorded via the `sycl-device-global-size` attribute. These attributes ensure that counters are discoverable and manageable by the SYCL runtime and integration headers/footers. + +The profile counter device global is represented as an array of 8-byte integers (`std::uint64_t`). 
The number of elements in this array corresponds to the number of regions in the function being instrumented, where a region typically represents a distinct code branch or block. The size of the device global variable is therefore determined by multiplying the number of regions by eight bytes, and this value is recorded in the `sycl-device-global-size` attribute for use by the runtime and integration logic. + +### Integration with Device Global Infrastructure + +The device global infrastructure, as described in [DeviceGlobal.md](DeviceGlobal.md), provides mechanisms for mapping host and device instances of global variables, managing their lifetimes, and facilitating data transfer. Device-side coverage counters are treated as a special class of device globals: + +- They use the shared allocation type rather than the device allocation type for the underlying USM memory. +- They do not have corresponding `device_global` declarations in host code. +- Their lifetime and cleanup are managed via the device global map, with integration footer code ensuring registration and deregistration. + +### Runtime Handling and Data Aggregation + +When a device global entry corresponding to a coverage counter is released (e.g., when a device image is unloaded), the SYCL runtime aggregates the values from the device-side counter into the equivalent host-side counter. Equivalence is determined by matching both the `` and the number of counter regions. If no matching host-side counter exists—typically due to differences in code between host and device caused by the `__SYCL_DEVICE_ONLY__` macro—the device-side counter values are discarded. + +The aggregation is performed by invoking a new function in the compiler runtime, `__sycl_increment_profile_counters`, which is weakly linked to accommodate optional runtime availability. This function accepts the ``, the number of regions, and the increment values, and updates the host-side counters accordingly. 
At program exit, the final profile data reflects the sum of host and device coverage counters. + +### Compiler and Runtime Changes + +#### Compiler Frontend + +- The lowering pass for coverage counters is updated to emit device globals with the appropriate attributes and indirection. +- Integration headers and footers are updated to register device global counters with the runtime, using the unique identifier and size. + +#### SYCL Runtime + +- Device globals with IDs matching the `__profc_` pattern are recognized as coverage counters. +- USM allocation and management for counters is handled as for other device globals, but without host-side declarations. +- Upon cleanup, device-side counter values are aggregated into host-side counters via the runtime API. + +#### Compiler Runtime + +- The new function `__sycl_increment_profile_counters` is introduced to update host-side counters. +- The function is weakly linked to allow for optional inclusion. + +### Limitations and Considerations + +- The feature is currently implemented only for SPIR-V targets; CUDA and HIP backends are not supported. +- Devices lacking support for device globals cannot utilize device-side code coverage. +- Differences in code between host and device (e.g., due to `__SYCL_DEVICE_ONLY__`) may prevent aggregation of coverage data for some functions. +- The design relies on the robustness of the device global infrastructure for correct mapping and lifetime management. + +## Relationship to Device Global Design + +This feature is built upon the mechanisms described in [DeviceGlobal.md](DeviceGlobal.md), including: + +- Use of unique string identifiers (`sycl-unique-id`) for mapping and management. +- USM-based allocation and zero-initialization of device-side storage. +- Integration header/footer registration for host-device correlation. +- Runtime database for device global management and lookup. 
+ +The code coverage counters are a specialized use case of device globals, with additional logic for aggregation and profile generation. + +## References + +- [Implementation design for SYCL device globals](DeviceGlobal.md) +- [Clang Source-based Code Coverage](https://clang.llvm.org/docs/SourceBasedCodeCoverage.html) +- [SYCL Specification](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html) diff --git a/sycl/doc/design/spirv-extensions/SPV_INTEL_float4.asciidoc b/sycl/doc/design/spirv-extensions/SPV_INTEL_float4.asciidoc new file mode 100644 index 0000000000000..0ea88f7bdb13e --- /dev/null +++ b/sycl/doc/design/spirv-extensions/SPV_INTEL_float4.asciidoc @@ -0,0 +1,258 @@ +:extension_name: SPV_INTEL_float4 + +:hf4_capability_name: Float4E2M1INTEL +:hf4_capability_token: 6212 +:hf4_matrix_capability_name: Float4E2M1CooperativeMatrixINTEL +:hf4_matrix_capability_token: 6213 +:hf4_encoding: 6214 + +:khr_matrix_capability_name: CooperativeMatrixKHR + +:joint_matrix_url: https://github.com/intel/llvm/tree/sycl/sycl/doc/design/spirv-extensions/SPV_INTEL_joint_matrix.asciidoc +:fp_conv_url: https://github.com/intel/llvm/tree/sycl/sycl/doc/design/spirv-extensions/SPV_INTEL_fp_conversions.asciidoc +:coop_matrix_url: https://github.khronos.org/SPIRV-Registry/extensions/KHR/SPV_KHR_cooperative_matrix.html +:bfloat16_url: https://github.khronos.org/SPIRV-Registry/extensions/KHR/SPV_KHR_bfloat16.html +:fp8_url: https://github.khronos.org/SPIRV-Registry/extensions/EXT/SPV_EXT_float8.html + +{extension_name} +================ + + +== Name Strings + +{extension_name} + +== Contributors + +- Dmitry Sidorov, Intel + +- Victor Mustya, Intel + +- Ben Ashbaugh, Intel + +- Dounia Khaldi, Intel + +- Joe Garvey, Intel + +- Greg Lueck, Intel + +- Pawel Jurek, Intel + + +Notice +------ + +Copyright (c) 2025 Intel Corporation. All rights reserved. 
+ +Status +------ + +* Working Draft + +This is a preview extension specification, intended to provide early access to +a feature for review and community feedback. When the feature matures, this +specification may be released as a formal extension. + +Because the interfaces defined by this specification are not final and are +subject to change they are not intended to be used by shipping software +products. If you are interested in using this feature in your software product, +please let us know! + +== Version + +[width="40%",cols="25,25"] +|======================================== +| Last Modified Date | 2025-10-24 +| Revision | 2 +|======================================== + +== Dependencies + +This extension is written against the SPIR-V Specification, +Version 1.6 Revision 4. + +This extension interacts with {coop_matrix_url}[*SPV_KHR_cooperative_matrix*] extension. + +This extension interacts with {joint_matrix_url}[*SPV_INTEL_joint_matrix*] extension. + +This extension interacts with {bfloat16_url}[*SPV_KHR_bfloat16*] extension. + +This extension interacts with {fp8_url}[*SPV_EXT_float8*] extension. + +This extension interacts with {fp_conv_url}[*SPV_INTEL_fp_conversions*] extension. + +This extension requires SPIR-V 1.0. + +Overview +-------- + +This extension extends the *OpTypeFloat* instruction to enable the definition of `FP4E2M1` +floating-point format that has one sign bit, two exponent bits and one mantissa bit. + +The `FP4E2M1` special values are defined by the table below. 
+ +[options="header"] +[width="80%"] +[cols="1,2"] +|==== +| ^| `FP4E2M1` +| Exponent Bias | 1 +| Max normal +| S.11.1 = 6.0 (1.5 * 2^2^) + +| Min normal +| S.01.0 = 1.0 (1.0 * 2^0^) + +| Max subnormal +| S.00.1 = 0.5 (0.5 * 2^0^) + +| Min subnormal +| S.00.1 = 0.5 (0.5 * 2^0^) + +| Infinity | N/A +| NaN | N/A + +|==== + +== Modifications to the SPIR-V Specification, Version 1.6 + +Binary Form +~~~~~~~~~~~ + +FP Encoding +~~~~~~~~~~~ + +Add a new enum: + +-- +[cols="^2,14,2,4",options="header",width = "100%"] +|==== +2+^.^| FP Encoding | Width(s) | Enabling Capabilities +| {hf4_encoding} | *Float4E2M1INTEL* + +The floating point type is encoded as a 4-bit float type. +This is encoded with the following encoding parameters: + + + - _bias_ is 1 + + + - _sign bit_ is 1 + + + - _w_ (exponent) is 2 + + + - _t_ (significand) is 1 + + + - _k_ (width) is 4 +| 4 | *Float4E2M1INTEL* + +|==== +-- + +=== Capabilities + +Modify Section 3.31, Capability, adding rows to the Capability table: + +-- +[options="header"] +|==== +2+^| Capability ^| Implicitly Declares +| {hf4_capability_token} | *{hf4_capability_name}* + +Uses *Float4E2M1INTEL* floating-point encoding. + +| +| {hf4_matrix_capability_token} | *{hf4_matrix_capability_name}* | *{khr_matrix_capability_name}* +|==== +-- + +=== Memory Layout + +Add to Section 2.18.1. Memory Layout, FPE2M1 4 layout: + +Scalar floating point variables with a `Width` of 4 can only be declared in the `Private` or `Function` storage classes. +In other storage classes, they must be included in an `OpTypeVector` with an even `Component Count`, where the first component in every pair is in bits 0-3 of the corresponding byte, and the second component is in bits 4-7. + +=== Instructions + +==== 3.42.11. 
Conversion Instructions + +* Add the following paragraphs to *OpFConvert*: + + +When converting to floating-point values with the *Float4E2M1INTEL* encoding, out-of-range +values and infinity are converted to the largest representable finite value with a matching sign. +Conversion from NaNs is implementation-defined. + + + + +==== 3.49.6. Type-Declaration Instructions + +Add the following requirement to *OpTypeCooperativeMatrixKHR*: + +If _Component Type_ has a *Float4E2M1INTEL* encoding then *{hf4_matrix_capability_name}* must be declared. + +Validation Rules +~~~~~~~~~~~~~~~~ + +Add the following bullets to section 2.16.1, Universal Validation Rules: + + * Variables with a type that is or includes a floating-point type with the *Float4E2M1INTEL* encoding must only be used with the following instructions: + ** https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_miscellaneous_instructions[Miscellaneous Instructions] : + *** OpUndef + ** https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_constant_creation_instructions[Constant Creation Instructions] : + *** OpConstant + *** OpConstantNull + *** OpConstantOp + *** OpConstantComposite + *** OpConstantCompositeContinuedINTEL + *** OpCooperativeMatrixConstructCheckedINTEL + *** OpSpecConstant + *** OpSpecConstantOp + *** OpSpecConstantComposite + *** OpSpecConstantCompositeContinuedINTEL + ** https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_arithmetic_instructions[Arithmetic Instructions] : + *** OpCooperativeMatrixMulAddKHR + *** OpCooperativeMatrixMulAddScaledINTEL + ** https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_composite_instructions[Composite Instructions] : + *** OpVectorExtractDynamic + *** OpVectorInsertDynamic + *** OpVectorShuffle + *** OpCompositeConstruct + *** OpCompositeExtract + *** OpCompositeInsert + *** OpCopyObject + *** OpCopyLogical + ** https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_memory_instructions[Memory Instructions] : + 
*** OpPtrEqual + *** OpPtrNotEqual + *** OpPtrDiff + *** OpCooperativeMatrixLoadKHR + *** OpCooperativeMatrixStoreKHR + *** OpCooperativeMatrixLoadCheckedINTEL + *** OpCooperativeMatrixStoreCheckedINTEL + ** https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_function_instructions[Function Instructions] : + *** OpFunction + *** OpFunctionParameter + *** OpFunctionCall + ** https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_conversion_instructions[Conversion Instructions] : + *** OpConvertSToF + *** OpFConvert + *** OpConvertPtrToU + *** OpConvertUToPtr + *** OpPtrCastToGeneric + *** OpGenericCastToPtr + *** OpGenericCastToPtrExplicit + *** OpBitcast + *** OpClampConvertFToFINTEL + *** OpBiasedRoundFToFINTEL + *** OpClampBiasedRoundFToFINTEL + *** OpBiasedRoundFToSINTEL + ** https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_control_flow_instructions[Control-Flow Instructions] : + *** OpReturnValue + *** OpSelect + *** OpPhi + *** OpLifetimeStart + *** OpLifetimeStop + +=== Issues + +- + +Revision History +---------------- + +[cols="5,15,15,70"] +[grid="rows"] +[options="header"] +|======================================== +|Rev|Date|Author|Changes +|1|2024-06-15|Dmitry Sidorov|Initial revision +|2|2025-10-24|Dmitry Sidorov|Prepare to publish +|======================================== diff --git a/sycl/doc/design/spirv-extensions/SPV_INTEL_fp_conversions.asciidoc b/sycl/doc/design/spirv-extensions/SPV_INTEL_fp_conversions.asciidoc new file mode 100644 index 0000000000000..6fa3f49867879 --- /dev/null +++ b/sycl/doc/design/spirv-extensions/SPV_INTEL_fp_conversions.asciidoc @@ -0,0 +1,322 @@ +:extension_name: SPV_INTEL_fp_conversions + +:convert_capability_name: FloatConversionsINTEL +:convert_capability_token: 6215 +:OpClampConvertFToFINTEL_token: 6216 +:OpClampConvertFToSINTEL_token: 6424 +:OpStochasticRoundFToFINTEL_token: 6217 +:OpClampStochasticRoundFToFINTEL_token: 6218 +:OpClampStochasticRoundFToSINTEL_token: 6219 + 
+:coop_matrix_url: https://github.khronos.org/SPIRV-Registry/extensions/KHR/SPV_KHR_cooperative_matrix.html +:bfloat16_url: https://github.khronos.org/SPIRV-Registry/extensions/KHR/SPV_KHR_bfloat16.html +:fp8_url: https://github.khronos.org/SPIRV-Registry/extensions/EXT/SPV_EXT_float8.html +:fp4_url: https://github.com/intel/llvm/tree/sycl/sycl/doc/design/spirv-extensions/SPV_INTEL_float4.asciidoc + +{extension_name} +================ + + +== Name Strings + +{extension_name} + +== Contributors + +- Dmitry Sidorov, Intel + +- Victor Mustya, Intel + +- Ben Ashbaugh, Intel + +- Dounia Khaldi, Intel + +- Joe Garvey, Intel + +- Greg Lueck, Intel + +- Pawel Jurek, Intel + +- John Lu, Intel + +- Eyal Radiano, Intel + +- Mateusz Garbowski, Intel + + +Notice +------ + +Copyright (c) 2025 Intel Corporation. All rights reserved. + +Status +------ + +* Working Draft + +This is a preview extension specification, intended to provide early access to +a feature for review and community feedback. When the feature matures, this +specification may be released as a formal extension. + +Because the interfaces defined by this specification are not final and are +subject to change they are not intended to be used by shipping software +products. If you are interested in using this feature in your software product, +please let us know! + +== Version + +[width="40%",cols="25,25"] +|======================================== +| Last Modified Date | 2025-10-24 +| Revision | 2 +|======================================== + +== Dependencies + +This extension is written against the SPIR-V Specification, +Version 1.6 Revision 4. + +This extension interacts with {coop_matrix_url}[*SPV_KHR_cooperative_matrix*] extension. + +This extension interacts with {bfloat16_url}[*SPV_KHR_bfloat16*] extension. + +This extension interacts with {fp8_url}[*SPV_EXT_float8*] extension. + +This extension interacts with {fp4_url}[*SPV_INTEL_float4*] extension. + +This extension requires SPIR-V 1.0. 
+ +Overview +-------- + +== Modifications to the SPIR-V Specification, Version 1.6 + +=== Capabilities + +Modify Section 3.31, Capability, adding rows to the Capability table: + +-- +[options="header"] +|==== +2+^| Capability ^| Implicitly Declares +| {convert_capability_token} | *{convert_capability_name}* + +Uses *OpClampConvertFToFINTEL*, *OpStochasticRoundFToFINTEL*, *OpClampStochasticRoundFToFINTEL* and *OpClampStochasticRoundFToSINTEL* +instructions. + +| +|==== +-- + +=== Instructions + +==== 3.42.11. Conversion Instructions + +[cols="1a,1,3*3",width="100%"] +|===== +4+|[[OpClampConvertFToFINTEL]]*OpClampConvertFToFINTEL* + + + +Converts numerically one floating point value to another. +In case of overflow, the positive result clamps to maximum normal value. +The negative result clamps to lowest negative normal value, which is equal to +maximum normal value multiplied by -1. + + + +_Result Type_ is the type of the converted object, it must be a scalar or +vector of _float type_. + + + +_Value_ must be a scalar or vector of _float type_. It must have a wider range +than the _Result Type_ and it must have the same number of components as 'Result Type'. + + + +Results are computed per component. + + + +1+|Capability: + +*{convert_capability_name}* +1+| 4 | {OpClampConvertFToFINTEL_token} +| __ + +_Result Type_ +| _Result _ +| __ + +_Value_ +|===== + +[cols="1a,1,3*3",width="100%"] +|===== +4+|[[OpClampConvertFToSINTEL]]*OpClampConvertFToSINTEL* + + + +Converts numerically a floating point value to integer. +In case of overflow, the result is saturated to INT_MAX or INT_MIN depending on a sign bit. + + + +_Result Type_ is the type of the converted object, it must be a scalar or +vector of _integer type_. + + + +_Value_ must be a scalar or vector of _float type_. +It must have the same number of components as 'Result Type'. + + + +Results are computed per component. 
+ + + +1+|Capability: + +*{convert_capability_name}* +1+| 4 | {OpClampConvertFToSINTEL_token} +| __ + +_Result Type_ +| _Result _ +| __ + +_Value_ +|===== + +[cols="1a,1,5*",width="100%"] +|===== +6+|[[OpStochasticRoundFToFINTEL]]*OpStochasticRoundFToFINTEL* + + + +Converts numerically one floating point value to another using stochastic rounding. + + +Stochastic rounding is performed by adding a pseudo-random bias value to the mantissa +of the converted value as follows. The bias is first added to the mantissa of the converted value. +If this causes the mantissa to overflow, then the exponent of the converted value +is increased by 1 and the mantissa bits are shifted right. The value is then converted +to the _Result Type_, rounding towards zero. If the exponent overflows when converting +to the _Result Type_, the result of the conversion is +/- Inf. If _Result Type_ doesn't have +Inf representation, then in case of overflow the result saturates to max normal value representable +by the type preserving the sign. + + + +As described above, each input requires a bias value in order to perform the conversion. +These bias values are generated by executing an implementation-defined algorithm +that produces pseudo-random values that uses _Seed_ as a starting point. This algorithm is +guaranteed to produce repeatable bias values when the same value is passed for _Seed_. + + + +The instruction also returns a value in _Next Seed_, which client code can use to generate +good quality random biases. If the client intends to call *OpStochasticRoundFToFINTEL* +again from the same kernel invocation, it should use this value as a new seed that it +passes as _Seed_ in that next call. + + + +_Result Type_ is the type of the converted object, it must be a scalar or +vector of _float type_. + + + +_Value_ must be a scalar or vector of _float type_. It must have a wider range +than the _Result Type_ and it must have the same number of components as 'Result Type'. 
+ + + +_Seed_ must be a 32-bit scalar _integer type_. + + + +_Next Seed_ must be of a _pointer type_ with *Function* storage class and 32-bit scalar _integer_ element type. + + + +Results are computed per component. + + + + +1+|Capability: + +*{convert_capability_name}* +1+| 4+ | {OpStochasticRoundFToFINTEL_token} +| __ + +_Result Type_ +| _Result _ +| __ + +_Value_ +| __ + +_Seed_ +| Optional __ + +_Next Seed_ +|===== + + +[cols="1a,1,5*3",width="100%"] +|===== +6+|[[OpClampStochasticRoundFToFINTEL]]*OpClampStochasticRoundFToFINTEL* + + + +Has the same semantics as *OpStochasticRoundFToFINTEL*, with an addition, that +in case of overflow, the positive result clamps to maximum normal value. +The negative result clamps to lowest negative normal value, which is equal to +maximum normal value multiplied by -1. + +This instruction may be used for stochastic rounding operation, if a producer passes +pseudo-random _Seed_ value. + + + +_Result Type_ is the type of the converted object, it must be a scalar or +vector of _float type_. + + + +_Value_ must be a scalar or vector of _float type_. It must have a wider range +than the _Result Type_ and it must have the same number of components as 'Result Type'. + + + +_Seed_ must be a 32-bit scalar _integer type_. + + + +_Next Seed_ must be of a _pointer type_ with *Function* storage class and 32-bit scalar _integer_ element type. + + + +Results are computed per component. + + + +1+|Capability: + +*{convert_capability_name}* +1+| 5+ | {OpClampStochasticRoundFToFINTEL_token} +| __ + +_Result Type_ +| _Result _ +| __ + +_Value_ +| __ + +_Seed_ +| Optional __ + +_Next Seed_ +|===== + + +[cols="1a,1,5*3",width="100%"] +|===== +6+|[[OpClampStochasticRoundFToSINTEL]]*OpClampStochasticRoundFToSINTEL* + + + +Converts a floating point value to integer using stochastic rounding. +Has the same semantics as *OpStochasticRoundFToFINTEL*. +In case of overflow, the result is saturated to INT_MAX or INT_MIN depending on a sign bit. 
+This instruction may be used for stochastic rounding operation, if a producer +passes pseudo-random _Seed_ value. + + + +_Result Type_ is the type of the converted object, it must be a scalar or +vector of _integer type_. + + + +_Value_ must be a scalar or vector of _float type_. It must have a wider range +than the _Result Type_ and it must have the same number of components as 'Result Type'. + + + +_Seed_ must be a 32-bit scalar _integer type_. + + + +Results are computed per component. + + + +_Next Seed_ must be of a _pointer type_ with *Function* storage class and 32-bit scalar _integer_ element type. + + + +1+|Capability: + +*{convert_capability_name}* +1+| 4+ | {OpClampStochasticRoundFToSINTEL_token} +| __ + +_Result Type_ +| _Result _ +| __ + +_Value_ +| __ + +_Seed_ +| Optional __ + +_Next Seed_ +|===== + + +Validation Rules +~~~~~~~~~~~~~~~~ + +Add the following bullets to section 2.16.1, Universal Validation Rules: + + * Variables with a type that is or includes a floating-point type with the *BFloat16KHR*, *Float8E4M3EXT*, *Float8E5M2EXT* and *Float4E2M1INTEL* encodings can also be used with the following instructions: + ** *OpClampConvertFToFINTEL* + + * Variables with a type that is or includes a floating-point type with the *BFloat16KHR*, *Float8E4M3EXT*, *Float8E5M2EXT* and *Float4E2M1INTEL* encodings can also be used with the following instructions: + ** *OpStochasticRoundFToFINTEL* + ** *OpClampStochasticRoundFToFINTEL* + + * Variables with a type that is or includes a floating-point type with the *BFloat16KHR* encoding can also be used with the following instructions: + ** *OpClampConvertFToSINTEL* and *OpClampStochasticRoundFToSINTEL* + + +== Interactions with SPV_KHR_cooperative_matrix + +When *CooperativeMatrixKHR* capability is declared it is allowed to convert a _cooperative matrix_ +using the instructions added by this extension. 
+ +If _Value_ is _cooperative matrix_, then the _Result Type_ must be a _cooperative matrix type_ +with the same _Rows_, _Columns_, _Scope_ and _Use_ operands. _Seed_ operand can be non-uniform, all +other operands to these instructions must be dynamically +uniform within every instance of the _Scope_ of the _cooperative matrix_. + + +=== Issues + +- + +Revision History +---------------- + +[cols="5,15,15,70"] +[grid="rows"] +[options="header"] +|======================================== +|Rev|Date|Author|Changes +|1|2024-06-15|Dmitry Sidorov|Initial revision +|2|2025-10-24|Dmitry Sidorov|Prepare to publish +|======================================== diff --git a/sycl/doc/design/spirv-extensions/mini_float_conversions_env.asciidoc b/sycl/doc/design/spirv-extensions/mini_float_conversions_env.asciidoc new file mode 100644 index 0000000000000..a1be113041092 --- /dev/null +++ b/sycl/doc/design/spirv-extensions/mini_float_conversions_env.asciidoc @@ -0,0 +1,145 @@ +Mini-float Types and Conversions Environment Specification +========================================================== + +This document provides a list of supported conversions for types and instructions +added in *SPV_EXT_float8*, *SPV_INTEL_int4*, *SPV_INTEL_float4* and *SPV_INTEL_fp_conversions* extensions +in Level-Zero and OpenCL Environments for Intel platforms. + + +Conversion from NaNs to `Float4E2M1INTEL` +----------------------------------------- + +NaNs are converted to the largest representable finite value with a matching sign. + +Float to float conversions via OpFConvert +----------------------------------------- + +Conversions to *OpTypeFloat* with *Float4E2M1INTEL* encoding are being done with +round to the nearest even (RTE) mode by default. It's illegal to put any *FPRoundingMode* +decoration other than *RTE* on the instruction in these cases. *RoundingModeRTZ* +execution mode has no effect on these conversions. 
+ + +Only the following conversions via *OpFConvert* to or from 4-bit floating-point values with the `Float4E2M1INTEL` encoding +are supported: + + + + +[cols="1,1", options="header"] +|=== +| To *Float4E2M1INTEL* 'Result' | From *Float4E2M1INTEL* 'Value' +| From 16-bit *IEEE754* | To 16-bit *IEEE754* +| From 16-bit *BFloat16KHR* | To 16-bit *BFloat16KHR* +| | To 8-bit *Float8E4M3EXT* +| | To 8-bit *Float8E5M2EXT* +|=== + +Only the following conversions via *OpFConvert* to or from 8-bit floating-point values with the `Float8E4M3EXT` and `Float8E5M2EXT` encodings +are supported: + + + + +[cols="1,1", options="header"] +|=== +| To *Float8E4M3EXT* 'Result' | From *Float8E4M3EXT* 'Value' +| From 16-bit *IEEE754* | To 16-bit *IEEE754* +| From 16-bit *BFloat16KHR* | To 16-bit *BFloat16KHR* +| From 4-bit *Float4E2M1INTEL* | +|=== + +[cols="1,1", options="header"] +|=== +| To *Float8E5M2EXT* 'Result' | From *Float8E5M2EXT* 'Value' +| From 16-bit *IEEE754* | To 16-bit *IEEE754* +| From 16-bit *BFloat16KHR* | To 16-bit *BFloat16KHR* +| From 4-bit *Float4E2M1INTEL* | +|=== + +Float to integer conversions via OpConvertFToS +---------------------------------------------- + +Only the following conversions via *OpConvertFToS* from float to 4-bit integer values are supported: + + + + +[cols="1,1", options="header"] +|=== +| _Result_ | 'Value' +| 4-bit integer | 16-bit *IEEE754* +| 4-bit integer | 16-bit *BFloat16KHR* +|=== + +Float to float conversions via OpClampConvertFToFINTEL +------------------------------------------------------ + +Only the following conversions via *OpClampConvertFToFINTEL* are supported: + + + + +[cols="1,1", options="header"] +|=== +| _Result_ | 'Value' +| 16-bit *IEEE754* | 32-bit *IEEE754* +| 8-bit *Float8E5M2EXT* | 16-bit *IEEE754* +| 8-bit *Float8E5M2EXT* | 16-bit *BFloat16KHR* +| 8-bit *Float8E4M3EXT* | 16-bit *IEEE754* +| 8-bit *Float8E4M3EXT* | 16-bit *BFloat16KHR* +| 4-bit *Float4E2M1INTEL* | 16-bit *IEEE754* +| 4-bit *Float4E2M1INTEL* | 16-bit 
*BFloat16KHR* +|=== + +Float to integer conversions via OpClampConvertFToSINTEL +-------------------------------------------------------- + +Only the following conversions via *OpClampConvertFToSINTEL* from float to 4-bit integer values are supported: + + + + +[cols="1,1", options="header"] +|=== +| _Result_ | 'Value' +| 4-bit integer | 16-bit *IEEE754* +| 4-bit integer | 16-bit *BFloat16KHR* +|=== + +Float to float conversions via OpStochasticRoundFToFINTEL +--------------------------------------------------------- + +Only the following conversions via *OpStochasticRoundFToFINTEL* are supported: + + + + +[cols="1,1", options="header"] +|=== +| _Result_ | 'Value' +| 16-bit *IEEE754* | 32-bit *IEEE754* +| 8-bit *Float8E5M2EXT* | 16-bit *IEEE754* +| 8-bit *Float8E5M2EXT* | 16-bit *BFloat16KHR* +| 8-bit *Float8E4M3EXT* | 16-bit *IEEE754* +| 8-bit *Float8E4M3EXT* | 16-bit *BFloat16KHR* +| 4-bit *Float4E2M1INTEL* | 16-bit *IEEE754* +| 4-bit *Float4E2M1INTEL* | 16-bit *BFloat16KHR* +|=== + +Float to float conversions via OpClampStochasticRoundFToFINTEL +-------------------------------------------------------------- + +Only the following conversions via *OpClampStochasticRoundFToFINTEL* are supported: + + + +[cols="1,1", options="header"] +|=== +| _Result_ | 'Value' +| 16-bit *IEEE754* | 32-bit *IEEE754* +| 8-bit *Float8E5M2EXT* | 16-bit *IEEE754* +| 8-bit *Float8E5M2EXT* | 16-bit *BFloat16KHR* +| 8-bit *Float8E4M3EXT* | 16-bit *IEEE754* +| 8-bit *Float8E4M3EXT* | 16-bit *BFloat16KHR* +| 4-bit *Float4E2M1INTEL* | 16-bit *IEEE754* +| 4-bit *Float4E2M1INTEL* | 16-bit *BFloat16KHR* +|=== + + +Float to integer conversions via OpClampStochasticRoundFToSINTEL +---------------------------------------------------------------- + +Only the following conversions via *OpClampStochasticRoundFToSINTEL* from float to 4-bit integer values are supported: + + + + +[cols="1,1", options="header"] +|=== +| _Result_ | 'Value' +| 4-bit integer | 16-bit *IEEE754* +| 4-bit integer | 16-bit 
*BFloat16KHR* +|=== diff --git a/sycl/doc/index.rst b/sycl/doc/index.rst index fa885e8cdb000..8197a8dac38e8 100644 --- a/sycl/doc/index.rst +++ b/sycl/doc/index.rst @@ -39,6 +39,7 @@ Design Documents for the oneAPI DPC++ Compiler design/ParallelForRangeRounding design/SYCLInstrumentationUsingXPTI design/ITTAnnotations + design/DeviceCodeCoverage design/DeviceGlobal design/CompileTimeProperties design/HostPipes diff --git a/sycl/include/sycl/__spirv/spirv_ops.hpp b/sycl/include/sycl/__spirv/spirv_ops.hpp index ad0c7a31d3519..4f68506aaca03 100644 --- a/sycl/include/sycl/__spirv/spirv_ops.hpp +++ b/sycl/include/sycl/__spirv/spirv_ops.hpp @@ -838,19 +838,6 @@ __clc_BarrierTestWait(int64_t *state, int64_t arrival) noexcept; __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void __clc_BarrierArriveAndWait(int64_t *state) noexcept; -#if defined(__SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__) && \ - !defined(__INTEL_PREVIEW_BREAKING_CHANGES) -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wpedantic" -#warning \ - "__SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ is deprecated and will be removed in a future release." -#pragma clang diagnostic pop -#endif -extern __DPCPP_SYCL_EXTERNAL int -__spirv_ocl_printf(const __attribute__((opencl_constant)) char *Format, ...); -extern __DPCPP_SYCL_EXTERNAL int __spirv_ocl_printf(const char *Format, ...); -#else template extern __DPCPP_SYCL_EXTERNAL int __spirv_ocl_printf(const __attribute__((opencl_constant)) char *Format, @@ -858,7 +845,6 @@ __spirv_ocl_printf(const __attribute__((opencl_constant)) char *Format, template extern __DPCPP_SYCL_EXTERNAL int __spirv_ocl_printf(const char *Format, Args... 
args); -#endif // Native builtin extension diff --git a/sycl/include/sycl/accessor.hpp b/sycl/include/sycl/accessor.hpp index 3f48859aefed3..ad938b2685e07 100644 --- a/sycl/include/sycl/accessor.hpp +++ b/sycl/include/sycl/accessor.hpp @@ -516,6 +516,7 @@ using AccessorImplPtr = std::shared_ptr; class __SYCL_EXPORT AccessorBaseHost { protected: AccessorBaseHost(const AccessorImplPtr &Impl) : impl{Impl} {} + friend sycl::detail::ImplUtils; public: AccessorBaseHost(id<3> Offset, range<3> AccessRange, range<3> MemoryRange, @@ -550,16 +551,6 @@ class __SYCL_EXPORT AccessorBaseHost { void *getMemoryObject() const; - template - friend const decltype(Obj::impl) &getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - template friend class accessor; @@ -574,6 +565,8 @@ class LocalAccessorImplHost; using LocalAccessorImplPtr = std::shared_ptr; class __SYCL_EXPORT LocalAccessorBaseHost { + friend sycl::detail::ImplUtils; + protected: LocalAccessorBaseHost(const LocalAccessorImplPtr &Impl) : impl{Impl} {} @@ -589,17 +582,6 @@ class __SYCL_EXPORT LocalAccessorBaseHost { const property_list &getPropList() const; protected: - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - LocalAccessorImplPtr impl; }; } // namespace detail @@ -623,6 +605,8 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : public detail::OwnerLessBase< accessor> { + friend sycl::detail::ImplUtils; + protected: static_assert((AccessTarget == access::target::global_buffer || AccessTarget == access::target::constant_buffer || @@ -854,17 +838,6 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS 
__SYCL_TYPE(accessor) accessor : friend class sycl::stream; friend class sycl::ext::intel::esimd::detail::AccessorPrivateProxy; - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - public: // 4.7.6.9.1. Interface for buffer command accessors // value_type is defined as const DataT for read_only accessors, DataT @@ -2249,17 +2222,6 @@ class __SYCL_SPECIAL_CLASS local_accessor_base : return Result; } - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - template friend class local_accessor; public: @@ -2474,6 +2436,7 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(local_accessor) local_accessor access::placeholder::false_t>, public detail::OwnerLessBase> { + friend sycl::detail::ImplUtils; using local_acc = local_accessor_base(), @@ -2647,6 +2610,8 @@ template { + friend sycl::detail::ImplUtils; + protected: using AccessorT = accessor; @@ -2671,16 +2636,6 @@ class __SYCL_EBO host_accessor host_accessor(const detail::AccessorImplPtr &Impl) : accessor{Impl} {} - - template - friend const decltype(Obj::impl) &getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); #endif // __SYCL_DEVICE_ONLY__ public: diff --git a/sycl/include/sycl/accessor_image.hpp b/sycl/include/sycl/accessor_image.hpp index db57a0943266b..be5eb3a697ae4 100644 --- a/sycl/include/sycl/accessor_image.hpp +++ b/sycl/include/sycl/accessor_image.hpp @@ -77,6 +77,7 @@ class 
__SYCL_EXPORT UnsampledImageAccessorBaseHost { protected: UnsampledImageAccessorBaseHost(const UnsampledImageAccessorImplPtr &Impl) : impl{Impl} {} + friend sycl::detail::ImplUtils; public: UnsampledImageAccessorBaseHost(sycl::range<3> Size, access_mode AccessMode, @@ -97,18 +98,6 @@ class __SYCL_EXPORT UnsampledImageAccessorBaseHost { const property_list &getPropList() const; protected: - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - UnsampledImageAccessorImplPtr impl; // The function references helper methods required by GDB pretty-printers @@ -151,6 +140,7 @@ class __SYCL_EXPORT SampledImageAccessorBaseHost { protected: SampledImageAccessorBaseHost(const SampledImageAccessorImplPtr &Impl) : impl{Impl} {} + friend sycl::detail::ImplUtils; public: SampledImageAccessorBaseHost(sycl::range<3> Size, void *SYCLMemObject, @@ -173,18 +163,6 @@ class __SYCL_EXPORT SampledImageAccessorBaseHost { const property_list &getPropList() const; protected: - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - SampledImageAccessorImplPtr impl; // The function references helper methods required by GDB pretty-printers @@ -789,6 +767,7 @@ class __SYCL_EBO unsampled_image_accessor : #endif // __SYCL_DEVICE_ONLY__ public detail::OwnerLessBase< unsampled_image_accessor> { + friend sycl::detail::ImplUtils; static_assert(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v, @@ -940,18 +919,6 @@ class __SYCL_EBO unsampled_image_accessor : { (void)Impl; } - - template - friend const decltype(Obj::impl) & - 
detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); }; template > { + friend sycl::detail::ImplUtils; static_assert(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v, @@ -1082,18 +1050,6 @@ class __SYCL_EBO host_unsampled_image_accessor host_unsampled_image_accessor( const detail::UnsampledImageAccessorImplPtr &Impl) : base_class{Impl} {} - - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); }; template > { + friend sycl::detail::ImplUtils; static_assert(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v, @@ -1231,18 +1188,6 @@ class __SYCL_EBO sampled_image_accessor : { (void)Impl; } - - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); }; template @@ -1250,6 +1195,7 @@ class __SYCL_EBO host_sampled_image_accessor : private detail::SampledImageAccessorBaseHost, public detail::OwnerLessBase< host_sampled_image_accessor> { + friend sycl::detail::ImplUtils; static_assert(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v, @@ -1340,18 +1286,6 @@ class __SYCL_EBO host_sampled_image_accessor private: host_sampled_image_accessor(const detail::SampledImageAccessorImplPtr &Impl) : base_class{Impl} {} - - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - 
std::add_rvalue_reference_t ImplObj); - - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); }; } // namespace _V1 diff --git a/sycl/include/sycl/buffer.hpp b/sycl/include/sycl/buffer.hpp index 4c0d9575c7807..da1814996c807 100644 --- a/sycl/include/sycl/buffer.hpp +++ b/sycl/include/sycl/buffer.hpp @@ -93,6 +93,8 @@ struct BufferInterop; // The non-template base for the sycl::buffer class class __SYCL_EXPORT buffer_plain { + friend sycl::detail::ImplUtils; + protected: buffer_plain(size_t SizeInBytes, size_t, const property_list &Props, std::unique_ptr Allocator); @@ -730,9 +732,6 @@ class buffer : public detail::buffer_plain, } private: - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); template friend class buffer; template backend_return_t; /// /// \ingroup sycl_api class __SYCL_EXPORT context : public detail::OwnerLessBase { + friend sycl::detail::ImplUtils; + public: /// Constructs a SYCL context instance using an instance of default_selector. 
/// @@ -255,17 +257,6 @@ class __SYCL_EXPORT context : public detail::OwnerLessBase { template friend auto get_native(const SyclT &Obj) -> backend_return_t; - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - const property_list &getPropList() const; }; diff --git a/sycl/include/sycl/detail/common.hpp b/sycl/include/sycl/detail/common.hpp index a0476e21657a7..27e2b0560e81e 100644 --- a/sycl/include/sycl/detail/common.hpp +++ b/sycl/include/sycl/detail/common.hpp @@ -8,11 +8,6 @@ #pragma once -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES -#ifndef __SYCL_DEVICE_ONLY__ -#include -#endif -#endif // #ifndef __INTEL_PREVIEW_BREAKING_CHANGES #include // for __SYCL_ALWAYS_INLINE #include // for __SYCL_EXPORT @@ -101,14 +96,8 @@ struct code_location { private: const char *MFileName; const char *MFunctionName; -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES - // For preserving layout of handler class - unsigned long MLineNo; - unsigned long MColumnNo; -#else uint32_t MLineNo; uint32_t MColumnNo; -#endif }; /// @brief Data type that manages the code_location information in TLS @@ -151,22 +140,9 @@ class __SYCL_EXPORT tls_code_loc_t { /// @param CodeLoc The code location information to set up the TLS slot with. tls_code_loc_t(const detail::code_location &CodeLoc); -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES // Used to maintain global state (GCodeLocTLS), so we do not want to copy tls_code_loc_t(const tls_code_loc_t &) = delete; tls_code_loc_t &operator=(const tls_code_loc_t &) = delete; -#else - tls_code_loc_t &operator=(const tls_code_loc_t &) { - // Should never be called. In PREVIEW we marked it as deleted, but - // before ABI breaking change we need to keep it for backward compatibility. 
- assert(false && "tls_code_loc_t should not be copied"); -#ifndef __SYCL_DEVICE_ONLY__ - throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), - "tls_code_loc_t should not be copied"); -#endif - return *this; - } -#endif // __INTEL_PREVIEW_BREAKING_CHANGES /// If the code location is set up by this instance, reset it. ~tls_code_loc_t(); @@ -179,10 +155,8 @@ class __SYCL_EXPORT tls_code_loc_t { bool isToplevel() const { return !MLocalScope; } private: -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES // Cache the TLS location to decrease amount of TLS accesses. detail::code_location &CodeLocTLSRef; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES // The flag that is used to determine if the object is in a local scope or in // the top level scope. bool MLocalScope = true; diff --git a/sycl/include/sycl/detail/impl_utils.hpp b/sycl/include/sycl/detail/impl_utils.hpp index 00f61558a88d4..f5307b03d5d52 100644 --- a/sycl/include/sycl/detail/impl_utils.hpp +++ b/sycl/include/sycl/detail/impl_utils.hpp @@ -8,55 +8,48 @@ #pragma once -#include // for assert -#include // for hash -#include // for add_pointer_t -#include // for forward +#include +#include +#include +#include +#include namespace sycl { inline namespace _V1 { +class handler; namespace detail { +// Note! This class relies on the fact that all SYCL interface +// classes contain "impl" field that points to implementation object. "impl" +// field should be accessible from this class. +struct ImplUtils { + // Helper function for extracting implementation from SYCL's interface + // objects. + template + static const decltype(Obj::impl) &getSyclObjImpl(const Obj &SyclObj) { + assert(SyclObj.impl && "every constructor should create an impl"); + return SyclObj.impl; + } -// Helper function for extracting implementation from SYCL's interface objects. -// Note! This function relies on the fact that all SYCL interface classes -// contain "impl" field that points to implementation object. 
"impl" field -// should be accessible from this function. -// -// Note that due to a bug in MSVC compilers (including MSVC2019 v19.20), it -// may not recognize the usage of this function in friend member declarations -// if the template parameter name there is not equal to the name used here, -// i.e. 'Obj'. For example, using 'Obj' here and 'T' in such declaration -// would trigger that error in MSVC: -// template -// friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); -template -const decltype(Obj::impl) &getSyclObjImpl(const Obj &SyclObject) { - assert(SyclObject.impl && "every constructor should create an impl"); - return SyclObject.impl; -} - -// Helper function for creation SYCL interface objects from implementations. -// Note! These functions rely on the fact that all SYCL interface classes -// contain "impl" field that points to implementation object. "impl" field -// should be accessible from these functions. -template -T createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj) { - return T(std::forward(ImplObj)); -} + // Helper function for creation SYCL interface objects from implementations. 
+ template + static SyclObject createSyclObjFromImpl(From &&from) { + if constexpr (std::is_same_v>>) + return SyclObject{from.shared_from_this()}; + else + return SyclObject{std::forward(from)}; + } +}; -template -T createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj) { - return T(ImplObj); +template +auto getSyclObjImpl(const Obj &SyclObj) + -> decltype(ImplUtils::getSyclObjImpl(SyclObj)) { + return ImplUtils::getSyclObjImpl(SyclObj); } -template -T createSyclObjFromImpl( - std::add_lvalue_reference_t()))>::element_type> - ImplRef) { - return createSyclObjFromImpl(ImplRef.shared_from_this()); +template +SyclObject createSyclObjFromImpl(From &&from) { + return ImplUtils::createSyclObjFromImpl(std::forward(from)); } template struct sycl_obj_hash { diff --git a/sycl/include/sycl/detail/kernel_desc.hpp b/sycl/include/sycl/detail/kernel_desc.hpp index 2e6f5fdad5f80..e3134accc29f2 100644 --- a/sycl/include/sycl/detail/kernel_desc.hpp +++ b/sycl/include/sycl/detail/kernel_desc.hpp @@ -61,7 +61,8 @@ enum class kernel_param_kind_t { kind_work_group_memory = 6, kind_dynamic_work_group_memory = 7, kind_dynamic_accessor = 8, - kind_invalid = 0xf, // not a valid kernel kind + kind_struct_with_special_type = 9, // structs that contain special types + kind_invalid = 0xf, // not a valid kernel kind }; // describes a kernel parameter diff --git a/sycl/include/sycl/detail/string_view.hpp b/sycl/include/sycl/detail/string_view.hpp index d394de736d847..5a13c324605cb 100644 --- a/sycl/include/sycl/detail/string_view.hpp +++ b/sycl/include/sycl/detail/string_view.hpp @@ -22,47 +22,29 @@ namespace detail { class string_view { const char *str = nullptr; -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES size_t len = 0; -#endif public: constexpr string_view() noexcept = default; constexpr string_view(const string_view &strn) noexcept = default; constexpr string_view(string_view &&strn) noexcept = default; constexpr string_view(std::string_view strn) noexcept - : str(strn.data()) -#ifdef 
__INTEL_PREVIEW_BREAKING_CHANGES - , - len(strn.size()) -#endif - { - } + : str(strn.data()), len(strn.size()) {} string_view(const sycl::detail::string &strn) noexcept - : str(strn.c_str()) -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - , - len(strlen(strn.c_str())) -#endif - { - } + : str(strn.c_str()), len(strlen(strn.c_str())) {} constexpr string_view &operator=(string_view &&strn) noexcept = default; string_view &operator=(const string_view &strn) noexcept = default; constexpr string_view &operator=(std::string_view strn) noexcept { str = strn.data(); -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES len = strn.size(); -#endif return *this; } string_view &operator=(const sycl::detail::string &strn) noexcept { str = strn.c_str(); -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES len = strlen(strn.c_str()); -#endif return *this; } @@ -71,11 +53,7 @@ class string_view { constexpr operator std::string_view() const noexcept { if (str == nullptr) return std::string_view{}; -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES return std::string_view(str, len); -#else - return std::string_view(str); -#endif } }; diff --git a/sycl/include/sycl/device.hpp b/sycl/include/sycl/device.hpp index 381f537df3c4d..a2836228704f5 100644 --- a/sycl/include/sycl/device.hpp +++ b/sycl/include/sycl/device.hpp @@ -66,6 +66,8 @@ enum class peer_access { /// \ingroup sycl_api class __SYCL_STANDALONE_DEBUG __SYCL_EXPORT device : public detail::OwnerLessBase { + friend sycl::detail::ImplUtils; + public: /// Constructs a SYCL device instance using the default device. 
device(); @@ -370,17 +372,6 @@ class __SYCL_STANDALONE_DEBUG __SYCL_EXPORT device ur_native_handle_t getNative() const; - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - template friend auto get_native(const SyclObjectT &Obj) -> backend_return_t; diff --git a/sycl/include/sycl/event.hpp b/sycl/include/sycl/event.hpp index 98e722e185bcc..0791ee0ecc650 100644 --- a/sycl/include/sycl/event.hpp +++ b/sycl/include/sycl/event.hpp @@ -42,6 +42,8 @@ class event_impl; /// /// \ingroup sycl_api class __SYCL_EXPORT event : public detail::OwnerLessBase { + friend sycl::detail::ImplUtils; + public: /// Constructs a ready SYCL event. /// @@ -144,17 +146,6 @@ class __SYCL_EXPORT event : public detail::OwnerLessBase { std::shared_ptr impl; - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - template friend auto get_native(const SyclObjectT &Obj) -> backend_return_t; diff --git a/sycl/include/sycl/ext/oneapi/bindless_images_memory.hpp b/sycl/include/sycl/ext/oneapi/bindless_images_memory.hpp index 88c5bdbb59c74..88e480684e4c1 100644 --- a/sycl/include/sycl/ext/oneapi/bindless_images_memory.hpp +++ b/sycl/include/sycl/ext/oneapi/bindless_images_memory.hpp @@ -58,6 +58,7 @@ class image_mem_impl { /// A class that represents image memory class __SYCL_EXPORT image_mem { + friend sycl::detail::ImplUtils; using raw_handle_type = image_mem_handle; public: @@ -93,10 +94,6 @@ class __SYCL_EXPORT image_mem { protected: std::shared_ptr impl; - - template - friend const decltype(Obj::impl) & - sycl::detail::getSyclObjImpl(const 
Obj &SyclObject); }; /// Direction to copy data from bindless image handle diff --git a/sycl/include/sycl/ext/oneapi/experimental/async_alloc/memory_pool.hpp b/sycl/include/sycl/ext/oneapi/experimental/async_alloc/memory_pool.hpp index d21c32a6945b4..3fc86d245229f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/async_alloc/memory_pool.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/async_alloc/memory_pool.hpp @@ -24,6 +24,8 @@ class memory_pool_impl; /// Memory pool class __SYCL_EXPORT memory_pool { + friend sycl::detail::ImplUtils; + public: template - friend const decltype(Obj::impl) & - sycl::detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T sycl::detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T sycl::detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - template pool_properties stripProps(Properties props) { pool_properties poolProps{}; if constexpr (decltype(props)::template has_property()) { diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp index e0b8fbb861e0a..fc9638c382cc5 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp @@ -10,6 +10,7 @@ #define SYCL_EXT_ONEAPI_CUDA_TEX_CACHE_READ 1 +#include #include #if defined(_WIN32) || defined(_WIN64) diff --git a/sycl/include/sycl/ext/oneapi/experimental/free_function_traits.hpp b/sycl/include/sycl/ext/oneapi/experimental/free_function_traits.hpp index 2b5d1f4190d21..f399c380fd5f8 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/free_function_traits.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/free_function_traits.hpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #pragma once +#include +#include namespace sycl { inline namespace _V1 { @@ -44,6 +46,26 @@ template struct is_kernel { template 
inline constexpr bool is_kernel_v = is_kernel::value; +namespace detail { +// A struct with special type is a struct type that contains special types +// passed as a parameter to a free function kernel. It is decomposed into its +// constituents by the frontend which puts the relevant information about each +// of them into the struct below, namely offset, size and parameter kind for +// each one of them. The runtime then calls the addArg function to add each one +// of them as kernel arguments. The value bool is used to distinguish these +// structs from ordinary (e.g. standard layout) structs. +template struct is_struct_with_special_type { + static constexpr bool value = false; + static constexpr int offsets[] = {-1}; + static constexpr int sizes[] = {-1}; + static constexpr sycl::detail::kernel_param_kind_t kinds[] = { + sycl::detail::kernel_param_kind_t::kind_invalid}; +}; + +} // namespace detail } // namespace ext::oneapi::experimental + +template struct is_device_copyable; + } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph/command_graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph/command_graph.hpp index b21336d608a98..29eb3e4bc9b72 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph/command_graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph/command_graph.hpp @@ -85,6 +85,8 @@ UnsupportedFeatureToString(UnsupportedGraphFeatures Feature) { /// Graph in the modifiable state. template class command_graph : public detail::modifiable_command_graph { + friend sycl::detail::ImplUtils; + public: /// Constructor. /// @param SyclContext Context to use for graph. @@ -113,13 +115,6 @@ class command_graph : public detail::modifiable_command_graph { /// @param Impl Detail implementation class to construct object with. 
command_graph(const std::shared_ptr &Impl) : modifiable_command_graph(Impl) {} - - template - friend T sycl::detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T sycl::detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); }; template <> diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph/dynamic.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph/dynamic.hpp index e617f2c6dc270..9bfc71e378ec0 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph/dynamic.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph/dynamic.hpp @@ -45,6 +45,8 @@ class dynamic_command_group_impl; } // namespace detail class __SYCL_EXPORT dynamic_command_group { + friend sycl::detail::ImplUtils; + public: dynamic_command_group( const command_graph &Graph, @@ -64,15 +66,13 @@ class __SYCL_EXPORT dynamic_command_group { } private: - template - friend const decltype(Obj::impl) & - sycl::detail::getSyclObjImpl(const Obj &SyclObject); - std::shared_ptr impl; }; namespace detail { class __SYCL_EXPORT dynamic_parameter_base { + friend sycl::detail::ImplUtils; + public: dynamic_parameter_base(size_t ParamSize, const void *Data); dynamic_parameter_base(); @@ -101,10 +101,6 @@ class __SYCL_EXPORT dynamic_parameter_base { void updateAccessor(const sycl::detail::AccessorBaseHost *Acc); std::shared_ptr impl; - - template - friend const decltype(Obj::impl) & - sycl::detail::getSyclObjImpl(const Obj &SyclObject); }; class __SYCL_EXPORT dynamic_work_group_memory_base diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph/executable_graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph/executable_graph.hpp index e5e90b4a90046..37d589b2be6cb 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph/executable_graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph/executable_graph.hpp @@ -33,6 +33,8 @@ class exec_graph_impl; // Templateless executable command-graph base class. 
class __SYCL_EXPORT executable_command_graph : public sycl::detail::OwnerLessBase { + friend sycl::detail::ImplUtils; + public: /// An executable command-graph is not user constructable. executable_command_graph() = delete; @@ -74,10 +76,6 @@ class __SYCL_EXPORT executable_command_graph const sycl::context &Ctx, const property_list &PropList = {}); - template - friend const decltype(Obj::impl) & - sycl::detail::getSyclObjImpl(const Obj &SyclObject); - /// Creates a backend representation of the graph in \p impl member variable. void finalizeImpl(); diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph/modifiable_graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph/modifiable_graph.hpp index 1b66edadf566b..8ffd73247192f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph/modifiable_graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph/modifiable_graph.hpp @@ -41,6 +41,8 @@ class graph_impl; // Templateless modifiable command-graph base class. class __SYCL_EXPORT modifiable_command_graph : public sycl::detail::OwnerLessBase { + friend sycl::detail::ImplUtils; + public: /// Constructor. /// @param SyclContext Context to use for graph. 
@@ -198,15 +200,6 @@ class __SYCL_EXPORT modifiable_command_graph void print_graph(sycl::detail::string_view path, bool verbose = false) const; - template - friend const decltype(Obj::impl) & - sycl::detail::getSyclObjImpl(const Obj &SyclObject); - template - friend T sycl::detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T sycl::detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); std::shared_ptr impl; static void checkNodePropertiesAndThrow(const property_list &Properties); diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph/node.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph/node.hpp index 91c756e32049f..0978fa4b8eb23 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph/node.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph/node.hpp @@ -47,6 +47,8 @@ enum class node_type { /// Class representing a node in the graph, returned by command_graph::add(). class __SYCL_EXPORT node { + friend sycl::detail::ImplUtils; + public: node() = delete; @@ -81,16 +83,6 @@ class __SYCL_EXPORT node { private: node(const std::shared_ptr &Impl) : impl(Impl) {} - template - friend const decltype(Obj::impl) & - sycl::detail::getSyclObjImpl(const Obj &SyclObject); - template - friend T sycl::detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T sycl::detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - std::shared_ptr impl; }; diff --git a/sycl/include/sycl/ext/oneapi/virtual_mem/physical_mem.hpp b/sycl/include/sycl/ext/oneapi/virtual_mem/physical_mem.hpp index c7338474e50ac..aecef93473911 100644 --- a/sycl/include/sycl/ext/oneapi/virtual_mem/physical_mem.hpp +++ b/sycl/include/sycl/ext/oneapi/virtual_mem/physical_mem.hpp @@ -28,6 +28,8 @@ enum class address_access_mode : char { none = 0, read = 1, read_write = 2 }; class __SYCL_EXPORT physical_mem : public sycl::detail::OwnerLessBase { + friend sycl::detail::ImplUtils; + public: 
physical_mem(const device &SyclDevice, const context &SyclContext, size_t NumBytes); @@ -57,14 +59,6 @@ class __SYCL_EXPORT physical_mem private: std::shared_ptr impl; - - template - friend const decltype(Obj::impl) & - sycl::detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T sycl::detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); }; } // namespace ext::oneapi::experimental diff --git a/sycl/include/sycl/functional.hpp b/sycl/include/sycl/functional.hpp index e0201e0a64a40..0f7acdeb93c19 100644 --- a/sycl/include/sycl/functional.hpp +++ b/sycl/include/sycl/functional.hpp @@ -20,28 +20,9 @@ template using multiplies = std::multiplies; template using bit_and = std::bit_and; template using bit_or = std::bit_or; template using bit_xor = std::bit_xor; - -// std:logical_and/std::logical_or with a non-void type returns bool, -// sycl requires returning T. -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES template struct logical_and : std::logical_and {}; template struct logical_or : std::logical_or {}; -#else -template struct logical_and { - T operator()(const T &lhs, const T &rhs) const { return lhs && rhs; } -}; - -template <> struct logical_and : std::logical_and {}; - -template struct logical_or { - T operator()(const T &lhs, const T &rhs) const { return lhs || rhs; } -}; - -template <> struct logical_or : std::logical_or {}; - -#endif - // sycl::minimum definition should be consistent with std::min template struct minimum { T operator()(const T &lhs, const T &rhs) const { diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index f73d362123272..51c7acaaaa313 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -419,6 +420,8 @@ template bool range_size_fits_in_size_t(const range &r) { /// /// \ingroup sycl_api class __SYCL_EXPORT handler { + friend sycl::detail::ImplUtils; + private: /// Constructs SYCL 
handler from the pre-constructed stack-allocated /// `handler_impl` (not enforced, but meaningless to do a heap allocation @@ -610,6 +613,10 @@ class __SYCL_EXPORT handler { if (!std::is_same::value && std::is_pointer::value) { addArg(detail::kernel_param_kind_t::kind_pointer, StoredArg, sizeof(T), ArgIndex); + } else if (ext::oneapi::experimental::detail::is_struct_with_special_type< + remove_cv_ref_t>::value) { + addArg(detail::kernel_param_kind_t::kind_struct_with_special_type, + StoredArg, sizeof(T), ArgIndex); } else { addArg(detail::kernel_param_kind_t::kind_std_layout, StoredArg, sizeof(T), ArgIndex); @@ -1378,7 +1385,10 @@ class __SYCL_EXPORT handler { || (!is_same_type::value && std::is_pointer_v>) // USM || is_same_type::value // Interop - || is_same_type::value; // Stream + || is_same_type::value // Stream + || + sycl::is_device_copyable_v>; // Structs that contain + // special types }; /// Sets argument for OpenCL interoperability kernels. @@ -1391,6 +1401,51 @@ class __SYCL_EXPORT handler { typename std::enable_if_t::value, void> set_arg(int ArgIndex, T &&Arg) { setArgHelper(ArgIndex, std::move(Arg)); + ++ArgIndex; + // The following concerns free function kernels only. + // if we are dealing with a struct parameter that contains special types + // inside, we call addArg for each field of the struct(special and standard + // layout included) at any nesting level using the information provided by + // the frontend with the arrays offsets, sizes, and kinds which as the name + // suggests, provide the offset, size and kind of each such field. 
+ if constexpr (ext::oneapi::experimental::detail:: + is_struct_with_special_type>::value) { + using type = + ext::oneapi::experimental::detail::is_struct_with_special_type< + remove_cv_ref_t>; + int NumArgs = 0; + while (type::offsets[NumArgs] != -1) { + void *FieldArg = (char *)(&Arg) + type::offsets[NumArgs]; + // treat accessors separately since we have to fetch the data ptr and + // pass that to the addArg function rather than the address of the + // accessor object itself. + if (type::kinds[NumArgs] == + detail::kernel_param_kind_t::kind_accessor) { + constexpr int AccessTargetMask = 0x7ff; + const access::target target = static_cast( + type::sizes[NumArgs] & AccessTargetMask); + if (target == target::local) { + detail::LocalAccessorBaseHost *LocalAccBase = + (detail::LocalAccessorBaseHost *)(FieldArg); + setLocalAccessorArgHelper(ArgIndex + NumArgs, *LocalAccBase); + } else { + detail::AccessorBaseHost *AccBase = + (detail::AccessorBaseHost *)(FieldArg); + const detail::AccessorImplPtr &AccImpl = + detail::getSyclObjImpl(*AccBase); + detail::AccessorImplHost *Req = AccImpl.get(); + addArg(type::kinds[NumArgs], Req, type::sizes[NumArgs], + ArgIndex + NumArgs); + } + } else { + // for non-accessors, simply call addArg normally. + addArg(type::kinds[NumArgs], FieldArg, type::sizes[NumArgs], + ArgIndex + NumArgs); + } + ++NumArgs; + } + incrementArgShift(NumArgs); + } } template friend class ext::intel::experimental::pipe; - template - friend const decltype(Obj::impl) & - sycl::detail::getSyclObjImpl(const Obj &SyclObject); - /// Read from a host pipe given a host address and /// \param Name name of the host pipe to be passed into lower level runtime /// \param Ptr host pointer of host pipe as identified by address of its const @@ -3236,6 +3287,8 @@ class __SYCL_EXPORT handler { queue getQueue(); + void incrementArgShift(int Shift); + protected: /// Registers event dependencies in this command group. 
void depends_on(const detail::EventImplPtr &Event); diff --git a/sycl/include/sycl/image.hpp b/sycl/include/sycl/image.hpp index cff83ee1efd0b..a2dbff58c4af7 100644 --- a/sycl/include/sycl/image.hpp +++ b/sycl/include/sycl/image.hpp @@ -170,6 +170,7 @@ inline image_channel_order FormatChannelOrder(image_format Format) { class __SYCL_EXPORT image_plain { protected: image_plain(const std::shared_ptr &Impl) : impl{Impl} {} + friend sycl::detail::ImplUtils; image_plain(image_channel_order Order, image_channel_type Type, const range<3> &Range, @@ -347,6 +348,7 @@ class image_common : public image_plain { template class unsampled_image_common : public image_common { private: + friend sycl::detail::ImplUtils; using common_base = typename detail::image_common; protected: @@ -427,6 +429,7 @@ class unsampled_image_common : public image_common { template class image : public detail::unsampled_image_common { private: + friend sycl::detail::ImplUtils; using common_base = typename detail::unsampled_image_common; @@ -694,10 +697,6 @@ class image : public detail::unsampled_image_common { make_image(const backend_input_t> &BackendObject, const context &TargetContext, event AvailableEvent); - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - template @@ -713,6 +712,7 @@ class unsampled_image : public detail::unsampled_image_common, public detail::OwnerLessBase> { private: + friend sycl::detail::ImplUtils; using common_base = typename detail::unsampled_image_common; @@ -981,18 +981,6 @@ class unsampled_image } private: - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - template friend class host_unsampled_image_accessor; @@ -1006,6 +994,7 @@ class sampled_image : public detail::image_common, public 
detail::OwnerLessBase> { private: + friend sycl::detail::ImplUtils; using common_base = typename detail::image_common; sampled_image(const std::shared_ptr &Impl) @@ -1122,18 +1111,6 @@ class sampled_image } private: - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - template friend class host_sampled_image_accessor; template diff --git a/sycl/include/sycl/kernel.hpp b/sycl/include/sycl/kernel.hpp index bb44daf12b9c3..42e5c00dfec00 100644 --- a/sycl/include/sycl/kernel.hpp +++ b/sycl/include/sycl/kernel.hpp @@ -67,6 +67,8 @@ template struct get_kernel_name_t { /// /// \ingroup sycl_api class __SYCL_EXPORT kernel : public detail::OwnerLessBase { + friend sycl::detail::ImplUtils; + public: /// Constructs a SYCL kernel instance from an OpenCL cl_kernel /// @@ -250,15 +252,6 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase { std::shared_ptr impl; - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); template friend auto get_native(const SyclObjectT &Obj) -> backend_return_t; diff --git a/sycl/include/sycl/kernel_bundle.hpp b/sycl/include/sycl/kernel_bundle.hpp index 08d0428216300..b4af63c854b08 100644 --- a/sycl/include/sycl/kernel_bundle.hpp +++ b/sycl/include/sycl/kernel_bundle.hpp @@ -67,6 +67,8 @@ std::enable_if_t, kernel_id> get_kernel_id(); /// /// \ingroup sycl_api class __SYCL_EXPORT kernel_id : public detail::OwnerLessBase { + friend sycl::detail::ImplUtils; + public: kernel_id() = delete; @@ -84,18 +86,6 @@ class __SYCL_EXPORT kernel_id : public detail::OwnerLessBase { : 
impl(std::move(Impl)) {} std::shared_ptr impl; - - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); }; namespace detail { @@ -104,6 +94,8 @@ class device_image_impl; // The class is used as a base for device_image for "untemplating" public // methods. class __SYCL_EXPORT device_image_plain { + friend sycl::detail::ImplUtils; + public: device_image_plain(const std::shared_ptr &Impl) : impl(Impl) {} @@ -133,18 +125,6 @@ class __SYCL_EXPORT device_image_plain { std::shared_ptr impl; - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - backend ext_oneapi_get_backend_impl() const noexcept; #if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) @@ -158,6 +138,8 @@ class __SYCL_EXPORT device_image_plain { template class device_image : public detail::device_image_plain, public detail::OwnerLessBase> { + friend sycl::detail::ImplUtils; + public: device_image() = delete; @@ -201,18 +183,6 @@ class device_image : public detail::device_image_plain, device_image(std::shared_ptr Impl) : device_image_plain(std::move(Impl)) {} - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - // To allow calling device_image_plain::getNative() template friend class kernel_bundle; }; @@ -223,6 +193,8 @@ using KernelBundleImplPtr = std::shared_ptr; // The class is used as a base for kernel_bundle to 
"untemplate" it's methods class __SYCL_EXPORT kernel_bundle_plain { + friend sycl::detail::ImplUtils; + public: kernel_bundle_plain(const detail::KernelBundleImplPtr &Impl) : impl(std::move(Impl)) {} @@ -324,6 +296,8 @@ class __SYCL_EXPORT kernel_bundle_plain { template class kernel_bundle : public detail::kernel_bundle_plain, public detail::OwnerLessBase> { + friend sycl::detail::ImplUtils; + public: using device_image_iterator = const device_image *; @@ -583,17 +557,6 @@ class kernel_bundle : public detail::kernel_bundle_plain, kernel_bundle(detail::KernelBundleImplPtr Impl) : kernel_bundle_plain(std::move(Impl)) {} - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - template friend auto get_native(const kernel_bundle &Obj) -> backend_return_t>; diff --git a/sycl/include/sycl/platform.hpp b/sycl/include/sycl/platform.hpp index ccb1e795610cf..6197daa74427a 100644 --- a/sycl/include/sycl/platform.hpp +++ b/sycl/include/sycl/platform.hpp @@ -63,6 +63,8 @@ class filter_selector; /// /// \ingroup sycl_api class __SYCL_EXPORT platform : public detail::OwnerLessBase { + friend sycl::detail::ImplUtils; + public: /// Constructs a SYCL platform using the default device. 
platform(); @@ -210,16 +212,6 @@ class __SYCL_EXPORT platform : public detail::OwnerLessBase { platform(const device &Device); - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - template friend auto get_native(const SyclObjectT &Obj) -> backend_return_t; diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 459bb522a8553..1e02a5fd930bc 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -295,6 +295,8 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, /// /// \ingroup sycl_api class __SYCL_EXPORT queue : public detail::OwnerLessBase { + friend sycl::detail::ImplUtils; + public: /// Constructs a SYCL queue instance using the device returned by an instance /// of default_selector. @@ -3751,16 +3753,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { std::shared_ptr impl; queue(std::shared_ptr impl) : impl(impl) {} - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); - template - friend T detail::createSyclObjFromImpl( - std::add_rvalue_reference_t ImplObj); - template - friend T detail::createSyclObjFromImpl( - std::add_lvalue_reference_t ImplObj); - template friend auto get_native(const SyclObjectT &Obj) -> backend_return_t; diff --git a/sycl/include/sycl/sampler.hpp b/sycl/include/sycl/sampler.hpp index 6dd69ff4ac734..feb3699976439 100644 --- a/sycl/include/sycl/sampler.hpp +++ b/sycl/include/sycl/sampler.hpp @@ -63,6 +63,8 @@ class sampler_impl; /// /// \ingroup sycl_api class __SYCL_EXPORT __SYCL_SPECIAL_CLASS __SYCL_TYPE(sampler) sampler { + friend sycl::detail::ImplUtils; + public: sampler(coordinate_normalization_mode normalizationMode, addressing_mode addressingMode, filtering_mode filteringMode, @@ -119,9 
+121,6 @@ class __SYCL_EXPORT __SYCL_SPECIAL_CLASS __SYCL_TYPE(sampler) sampler { private: #else std::shared_ptr impl; - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); #endif template { + friend sycl::detail::ImplUtils; + private: #ifndef __SYCL_DEVICE_ONLY__ // Constructor for recreating a stream. @@ -909,9 +911,6 @@ class __SYCL_EXPORT __SYCL_SPECIAL_CLASS __SYCL_TYPE(stream) stream char padding[sizeof(std::shared_ptr)]; #else std::shared_ptr impl; - template - friend const decltype(Obj::impl) & - detail::getSyclObjImpl(const Obj &SyclObject); #endif // NOTE: Some members are required for reconstructing the stream, but are not diff --git a/sycl/source/detail/common.cpp b/sycl/source/detail/common.cpp index f05b37d9986ce..1c7669bbf3652 100644 --- a/sycl/source/detail/common.cpp +++ b/sycl/source/detail/common.cpp @@ -24,18 +24,10 @@ static thread_local detail::code_location GCodeLocTLS = {}; /// check and see if code location object is available. If not, continue with /// instrumentation as needed tls_code_loc_t::tls_code_loc_t() -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES : CodeLocTLSRef(GCodeLocTLS), // Check TLS to see if a previously stashed code_location object is // available; if so, we are in a local scope. - MLocalScope(CodeLocTLSRef.fileName() && CodeLocTLSRef.functionName()) -#else - : // Check TLS to see if a previously stashed code_location object is - // available; if so, we are in a local scope. - MLocalScope(GCodeLocTLS.fileName() && GCodeLocTLS.functionName()) -#endif // __INTEL_PREVIEW_BREAKING_CHANGES -{ -} + MLocalScope(CodeLocTLSRef.fileName() && CodeLocTLSRef.functionName()) {} ur_code_location_t codeLocationCallback(void *) { ur_code_location_t codeloc; @@ -53,7 +45,6 @@ ur_code_location_t codeLocationCallback(void *) { /// location has been stashed in the TLS at a higher level. If not, we have the /// code location information that must be active for the current calling scope. 
tls_code_loc_t::tls_code_loc_t(const detail::code_location &CodeLoc) -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES : CodeLocTLSRef(GCodeLocTLS), // Check TLS to see if a previously stashed code_location object is // available; if so, then don't overwrite the previous information as we @@ -62,36 +53,17 @@ tls_code_loc_t::tls_code_loc_t(const detail::code_location &CodeLoc) if (!MLocalScope) // Update the TLS information with the code_location information CodeLocTLSRef = CodeLoc; -#else - : // Check TLS to see if a previously stashed code_location object is - // available; if so, then don't overwrite the previous information as we - // are still in scope of the instrumented function. - MLocalScope(GCodeLocTLS.fileName() && GCodeLocTLS.functionName()) { - if (!MLocalScope) - // Update the TLS information with the code_location information - GCodeLocTLS = CodeLoc; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES } /// @brief If we are the top lovel scope, reset the code location info tls_code_loc_t::~tls_code_loc_t() { // Only reset the TLS data if the top level function is going out of scope if (!MLocalScope) { -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES CodeLocTLSRef = {}; -#else - GCodeLocTLS = {}; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES } } -const detail::code_location &tls_code_loc_t::query() { -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - return CodeLocTLSRef; -#else - return GCodeLocTLS; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES -} +const detail::code_location &tls_code_loc_t::query() { return CodeLocTLSRef; } } // namespace detail } // namespace _V1 diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 59044782a43ce..053597fbc857e 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -128,6 +128,11 @@ context_impl::~context_impl() { if (DGEntry != nullptr) DGEntry->removeAssociatedResources(this); } + // Free all profile counter USM allocations associated with this context. 
+ for (DeviceGlobalMapEntry *DGEntry : + detail::ProgramManager::getInstance() + .getProfileCounterDeviceGlobalEntries(this)) + DGEntry->cleanupProfileCounter(this); MCachedLibPrograms.clear(); // TODO catch an exception and put it to list of asynchronous exceptions getAdapter().call_nocheck(MContext); diff --git a/sycl/source/detail/device_global_map.hpp b/sycl/source/detail/device_global_map.hpp index ea255bacda6ba..46682b06aff5f 100644 --- a/sycl/source/detail/device_global_map.hpp +++ b/sycl/source/detail/device_global_map.hpp @@ -75,7 +75,10 @@ class DeviceGlobalMap { // cannot be set until registration happens. auto EntryUPtr = std::make_unique( DeviceGlobal->Name, Img, TypeSize, DeviceImageScopeDecorated); - MDeviceGlobals.emplace(DeviceGlobal->Name, std::move(EntryUPtr)); + auto NewEntry = + MDeviceGlobals.emplace(DeviceGlobal->Name, std::move(EntryUPtr)); + if (NewEntry.first->second->isProfileCounter()) + MProfileCounterDeviceGlobals.push_back(NewEntry.first->second.get()); } } } @@ -114,6 +117,8 @@ class DeviceGlobalMap { auto EntryUPtr = std::make_unique(UniqueId, DeviceGlobalPtr); auto NewEntry = MDeviceGlobals.emplace(UniqueId, std::move(EntryUPtr)); + if (NewEntry.first->second->isProfileCounter()) + MProfileCounterDeviceGlobals.push_back(NewEntry.first->second.get()); MPtr2DeviceGlobal.insert({DeviceGlobalPtr, NewEntry.first->second.get()}); } @@ -154,6 +159,11 @@ class DeviceGlobalMap { } } + std::vector getProfileCounterEntries() { + std::lock_guard DeviceGlobalsGuard(MDeviceGlobalsMutex); + return MProfileCounterDeviceGlobals; + } + const std::unordered_map getPointerMap() const { return MPtr2DeviceGlobal; @@ -177,6 +187,9 @@ class DeviceGlobalMap { MDeviceGlobals; std::unordered_map MPtr2DeviceGlobal; + // List of profile counter device globals. + std::vector MProfileCounterDeviceGlobals; + /// Protects MDeviceGlobals and MPtr2DeviceGlobal. 
std::mutex MDeviceGlobalsMutex; }; diff --git a/sycl/source/detail/device_global_map_entry.cpp b/sycl/source/detail/device_global_map_entry.cpp index d31d9e8999348..cc92acbf6d581 100644 --- a/sycl/source/detail/device_global_map_entry.cpp +++ b/sycl/source/detail/device_global_map_entry.cpp @@ -56,6 +56,93 @@ OwnedUrEvent DeviceGlobalUSMMem::getInitEvent(adapter_impl &Adapter) { } } +bool DeviceGlobalMapEntry::isAvailableInContext( + const context_impl *CtxImpl) const { + std::lock_guard Lock{MDeviceToUSMPtrMapMutex}; + return std::any_of( + MDeviceToUSMPtrMap.begin(), MDeviceToUSMPtrMap.end(), + [CtxImpl](const auto &It) { return It.first.second == CtxImpl; }); +} + +bool DeviceGlobalMapEntry::isProfileCounter() const { + constexpr std::string_view CounterPrefix = "__profc_"; + return std::string_view{MUniqueId}.substr(0, CounterPrefix.size()) == + CounterPrefix; +} + +// __sycl_increment_profile_counters must be defined as a weak symbol so that +// the program will link even if the profiling runtime is not linked in. When +// compiling with MSVC there is no weak attribute, so we use a pragma comment +// and default function to achieve the same effect. When compiling with Apple +// Clang, profiling is unsupported and the function definition is empty. 
+#ifdef _MSC_VER +extern "C" void +__sycl_increment_profile_counters(std::uint64_t FnHash, std::size_t NumCounters, + const std::uint64_t *Increments); +extern "C" void +__sycl_increment_profile_counters_default(std::uint64_t FnHash, + std::size_t NumCounters, + const std::uint64_t *Increments) { + (void)FnHash; + (void)NumCounters; + (void)Increments; +} +#pragma comment( \ + linker, \ + "/alternatename:__sycl_increment_profile_counters=__sycl_increment_profile_counters_default") +#elif defined(__clang__) && defined(__apple_build_version__) +extern "C" void +__sycl_increment_profile_counters(std::uint64_t FnHash, std::size_t NumCounters, + const std::uint64_t *Increments) { + (void)FnHash; + (void)NumCounters; + (void)Increments; +} +#else +extern "C" void __attribute__((weak)) +__sycl_increment_profile_counters(std::uint64_t FnHash, std::size_t NumCounters, + const std::uint64_t *Increments); +#endif + +void DeviceGlobalMapEntry::cleanupProfileCounter(context_impl *CtxImpl) { + std::lock_guard Lock{MDeviceToUSMPtrMapMutex}; + assert(isProfileCounter() && "Not a profile counter device global."); + const std::size_t NumCounters = MDeviceGlobalTSize / sizeof(std::uint64_t); + const std::uint64_t FnHash = [&] { + constexpr size_t PrefixSize = std::string_view{"__profc_"}.size(); + constexpr int DecimalBase = 10; + return std::strtoull(MUniqueId.substr(PrefixSize).c_str(), nullptr, + DecimalBase); + }(); + for (const device_impl &Device : CtxImpl->getDevices()) { + auto USMPtrIt = MDeviceToUSMPtrMap.find({&Device, CtxImpl}); + if (USMPtrIt == MDeviceToUSMPtrMap.end()) + continue; + + // Get the increments from the USM pointer. + DeviceGlobalUSMMem &USMMem = USMPtrIt->second; + std::vector Increments(NumCounters); + const std::uint64_t *Counters = static_cast(USMMem.MPtr); + for (std::size_t I = 0; I < NumCounters; ++I) + Increments[I] = Counters[I]; + + // Call the weak symbol to update the profile counters. 
+ if (&__sycl_increment_profile_counters) + __sycl_increment_profile_counters(FnHash, Increments.size(), + Increments.data()); + + // Free the USM memory and release the event if it exists. + detail::usm::freeInternal(USMMem.MPtr, CtxImpl); + if (USMMem.MInitEvent != nullptr) + CtxImpl->getAdapter().call(USMMem.MInitEvent); + + // Set to nullptr to avoid double free. + USMMem.MPtr = nullptr; + USMMem.MInitEvent = nullptr; + MDeviceToUSMPtrMap.erase(USMPtrIt); + } +} + DeviceGlobalUSMMem & DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) { assert(!MIsDeviceImageScopeDecorated && @@ -70,7 +157,8 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) { return DGUSMPtr->second; void *NewDGUSMPtr = detail::usm::alignedAllocInternal( - 0, MDeviceGlobalTSize, &CtxImpl, &DevImpl, sycl::usm::alloc::device); + 0, MDeviceGlobalTSize, &CtxImpl, &DevImpl, + isProfileCounter() ? sycl::usm::alloc::shared : sycl::usm::alloc::device); auto NewAllocIt = MDeviceToUSMPtrMap.emplace( std::piecewise_construct, std::forward_as_tuple(&DevImpl, &CtxImpl), @@ -85,12 +173,12 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) { std::lock_guard Lock(NewAlloc.MInitEventMutex); ur_event_handle_t InitEvent = nullptr; if (MDeviceGlobalPtr) { - // C++ guarantees members appear in memory in the order they are declared, - // so since the member variable that contains the initial contents of the - // device_global is right after the usm_ptr member variable we can do - // some pointer arithmetic to memcopy over this value to the usm_ptr. This - // value inside of the device_global will be zero-initialized if it was - // not given a value on construction. 
+ // C++ guarantees members appear in memory in the order they are + // declared, so since the member variable that contains the initial + // contents of the device_global is right after the usm_ptr member + // variable we can do some pointer arithmetic to memcopy over this + // value to the usm_ptr. This value inside of the device_global will + // be zero-initialized if it was not given a value on construction. MemoryManager::copy_usm( reinterpret_cast( reinterpret_cast(MDeviceGlobalPtr) + @@ -98,8 +186,8 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) { QueueImpl, MDeviceGlobalTSize, NewAlloc.MPtr, std::vector{}, &InitEvent); } else { - // For SYCLBIN device globals we do not have a host pointer to copy from, - // so instead we fill the USM memory with 0's. + // For SYCLBIN device globals we do not have a host pointer to copy + // from, so instead we fill the USM memory with 0's. MemoryManager::fill_usm(NewAlloc.MPtr, QueueImpl, MDeviceGlobalTSize, {static_cast(0)}, {}, &InitEvent); } @@ -107,8 +195,8 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) { } // Only device globals with host variables need to be registered with the - // context. The rest will be managed by their kernel bundles and cleaned up - // accordingly. + // context. The rest will be managed by their kernel bundles and cleaned + // up accordingly. if (MDeviceGlobalPtr) CtxImpl.addAssociatedDeviceGlobal(MDeviceGlobalPtr); return NewAlloc; @@ -128,7 +216,8 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(const context &Context) { return DGUSMPtr->second; void *NewDGUSMPtr = detail::usm::alignedAllocInternal( - 0, MDeviceGlobalTSize, &CtxImpl, &DevImpl, sycl::usm::alloc::device); + 0, MDeviceGlobalTSize, &CtxImpl, &DevImpl, + isProfileCounter() ? 
sycl::usm::alloc::shared : sycl::usm::alloc::device); auto NewAllocIt = MDeviceToUSMPtrMap.emplace( std::piecewise_construct, std::forward_as_tuple(&DevImpl, &CtxImpl), @@ -139,20 +228,20 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(const context &Context) { NewAlloc.MAllocatingContext = CtxImpl.shared_from_this(); if (MDeviceGlobalPtr) { - // C++ guarantees members appear in memory in the order they are declared, - // so since the member variable that contains the initial contents of the - // device_global is right after the usm_ptr member variable we can do - // some pointer arithmetic to memcopy over this value to the usm_ptr. This - // value inside of the device_global will be zero-initialized if it was not - // given a value on construction. + // C++ guarantees members appear in memory in the order they are + // declared, so since the member variable that contains the initial + // contents of the device_global is right after the usm_ptr member + // variable we can do some pointer arithmetic to memcopy over this value + // to the usm_ptr. This value inside of the device_global will be + // zero-initialized if it was not given a value on construction. MemoryManager::context_copy_usm( reinterpret_cast( reinterpret_cast(MDeviceGlobalPtr) + sizeof(MDeviceGlobalPtr)), &CtxImpl, MDeviceGlobalTSize, NewAlloc.MPtr); } else { - // For SYCLBIN device globals we do not have a host pointer to copy from, - // so instead we fill the USM memory with 0's. + // For SYCLBIN device globals we do not have a host pointer to copy + // from, so instead we fill the USM memory with 0's. std::vector ImmBuff(MDeviceGlobalTSize, static_cast(0)); MemoryManager::context_copy_usm(ImmBuff.data(), &CtxImpl, @@ -160,8 +249,8 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(const context &Context) { } // Only device globals with host variables need to be registered with the - // context. The rest will be managed by their kernel bundles and cleaned up - // accordingly. + // context. 
The rest will be managed by their kernel bundles and cleaned + // up accordingly. if (MDeviceGlobalPtr) CtxImpl.addAssociatedDeviceGlobal(MDeviceGlobalPtr); return NewAlloc; diff --git a/sycl/source/detail/device_global_map_entry.hpp b/sycl/source/detail/device_global_map_entry.hpp index 9ff30938cbf34..4538dcf4bc1eb 100644 --- a/sycl/source/detail/device_global_map_entry.hpp +++ b/sycl/source/detail/device_global_map_entry.hpp @@ -110,6 +110,15 @@ struct DeviceGlobalMapEntry { MIsDeviceImageScopeDecorated = IsDeviceImageScopeDecorated; } + // Checks if the device_global is available in the given context. + bool isAvailableInContext(const context_impl *CtxImpl) const; + + // Returns true if the device_global is a profile counter. + bool isProfileCounter() const; + + // Cleans up a profile counter device global. + void cleanupProfileCounter(context_impl *CtxImpl); + // Gets or allocates USM memory for a device_global. DeviceGlobalUSMMem &getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl); @@ -135,7 +144,7 @@ struct DeviceGlobalMapEntry { std::map, DeviceGlobalUSMMem> MDeviceToUSMPtrMap; - std::mutex MDeviceToUSMPtrMapMutex; + mutable std::mutex MDeviceToUSMPtrMapMutex; }; } // namespace detail diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 6a9736d3af32c..5739b85ba4d91 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -467,62 +467,6 @@ device_impl::getImmediateProgressGuarantee( return forward_progress_guarantee::weakly_parallel; } -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES -#define EXPORT_GET_INFO(PARAM) \ - template <> \ - __SYCL_EXPORT PARAM::return_type device_impl::get_info() const { \ - return get_info_abi_workaround(); \ - } - -// clang-format off -EXPORT_GET_INFO(ext::intel::info::device::device_id) -EXPORT_GET_INFO(ext::intel::info::device::pci_address) -EXPORT_GET_INFO(ext::intel::info::device::gpu_eu_count) -EXPORT_GET_INFO(ext::intel::info::device::gpu_eu_simd_width) 
-EXPORT_GET_INFO(ext::intel::info::device::gpu_slices) -EXPORT_GET_INFO(ext::intel::info::device::gpu_subslices_per_slice) -EXPORT_GET_INFO(ext::intel::info::device::gpu_eu_count_per_subslice) -EXPORT_GET_INFO(ext::intel::info::device::gpu_hw_threads_per_eu) -EXPORT_GET_INFO(ext::intel::info::device::max_mem_bandwidth) -EXPORT_GET_INFO(ext::intel::info::device::uuid) -EXPORT_GET_INFO(ext::intel::info::device::free_memory) -EXPORT_GET_INFO(ext::intel::info::device::memory_clock_rate) -EXPORT_GET_INFO(ext::intel::info::device::memory_bus_width) -EXPORT_GET_INFO(ext::intel::info::device::max_compute_queue_indices) -EXPORT_GET_INFO(ext::intel::esimd::info::device::has_2d_block_io_support) -EXPORT_GET_INFO(ext::intel::info::device::current_clock_throttle_reasons) -EXPORT_GET_INFO(ext::intel::info::device::fan_speed) -EXPORT_GET_INFO(ext::intel::info::device::min_power_limit) -EXPORT_GET_INFO(ext::intel::info::device::max_power_limit) - -EXPORT_GET_INFO(ext::codeplay::experimental::info::device::supports_fusion) -EXPORT_GET_INFO(ext::codeplay::experimental::info::device::max_registers_per_work_group) - -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_global_work_groups) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_work_groups<1>) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_work_groups<2>) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_work_groups<3>) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::work_group_progress_capabilities) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::sub_group_progress_capabilities) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::sub_group_progress_capabilities) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::work_item_progress_capabilities) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::work_item_progress_capabilities) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::work_item_progress_capabilities) 
-EXPORT_GET_INFO(ext::oneapi::experimental::info::device::architecture) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::matrix_combinations) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::image_row_pitch_align) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_image_linear_row_pitch) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_image_linear_width) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_image_linear_height) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::mipmap_max_anisotropy) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::component_devices) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::composite_device) -EXPORT_GET_INFO(ext::oneapi::info::device::num_compute_units) -// clang-format on - -#undef EXPORT_GET_INFO -#endif - } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index a52fd65353a10..020aae35bb9a6 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -379,13 +379,7 @@ class device_impl : public std::enable_shared_from_this { struct InfoInitializer { template static void init(device_impl &device, typename Desc::return_type &value) { - value = device. -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - get_info -#else - get_info_abi_workaround -#endif - (); + value = device.get_info(); } }; @@ -562,23 +556,8 @@ class device_impl : public std::enable_shared_from_this { /// /// \return device info of type described in Table 4.20. -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES template decltype(auto) get_info() const { -#define CALL_GET_INFO get_info -#else - // We've been exporting - // `device_impl::get_info::info::device::` for no - // reason. Have to keep doing that until next ABI breaking window. 
Also, old - // gcc doesn't allow in-class specializations, so they have to go out-of-class - // which happens later then implicit instantiatons of delegating to - // `get_info`. As such, all such calls have to go through - // `get_info_abi_workaround` for which we need this ugly macro: -#define CALL_GET_INFO get_info_abi_workaround - template typename Param::return_type get_info() const; - template - decltype(auto) get_info_abi_workaround() const { -#endif using execution_scope = ext::oneapi::experimental::execution_scope; // With the return type of this function being automatically @@ -614,12 +593,12 @@ class device_impl : public std::enable_shared_from_this { } CASE(info::device::max_work_item_sizes<2>) { range<3> r3 = - CALL_GET_INFO, DependentFalse>(); + get_info, DependentFalse>(); return range<2>{r3[1], r3[2]}; } CASE(info::device::max_work_item_sizes<1>) { range<3> r3 = - CALL_GET_INFO, DependentFalse>(); + get_info, DependentFalse>(); return range<1>{r3[2]}; } @@ -710,8 +689,7 @@ class device_impl : public std::enable_shared_from_this { ';'); } CASE(info::device::built_in_kernel_ids) { - auto names = - CALL_GET_INFO(); + auto names = get_info(); std::vector ids; ids.reserve(names.size()); @@ -893,25 +871,25 @@ class device_impl : public std::enable_shared_from_this { CASE(info::device::ext_oneapi_max_global_work_groups) { // Deprecated alias. - return CALL_GET_INFO< + return get_info< ext::oneapi::experimental::info::device::max_global_work_groups, DependentFalse>(); } CASE(info::device::ext_oneapi_max_work_groups_1d) { // Deprecated alias. - return CALL_GET_INFO< + return get_info< ext::oneapi::experimental::info::device::max_work_groups<1>, DependentFalse>(); } CASE(info::device::ext_oneapi_max_work_groups_2d) { // Deprecated alias. - return CALL_GET_INFO< + return get_info< ext::oneapi::experimental::info::device::max_work_groups<2>, DependentFalse>(); } CASE(info::device::ext_oneapi_max_work_groups_3d) { // Deprecated alias. 
- return CALL_GET_INFO< + return get_info< ext::oneapi::experimental::info::device::max_work_groups<3>, DependentFalse>(); } @@ -936,7 +914,7 @@ class device_impl : public std::enable_shared_from_this { return static_cast((std::numeric_limits::max)()); } CASE(ext::oneapi::experimental::info::device::max_work_groups<3>) { - size_t Limit = CALL_GET_INFO< + size_t Limit = get_info< ext::oneapi::experimental::info::device::max_global_work_groups, DependentFalse>(); @@ -949,15 +927,15 @@ class device_impl : public std::enable_shared_from_this { std::min(Limit, result[0])); } CASE(ext::oneapi::experimental::info::device::max_work_groups<2>) { - id<3> max_3d = CALL_GET_INFO< - ext::oneapi::experimental::info::device::max_work_groups<3>, - DependentFalse>(); + id<3> max_3d = + get_info, + DependentFalse>(); return id<2>{max_3d[1], max_3d[2]}; } CASE(ext::oneapi::experimental::info::device::max_work_groups<1>) { - id<3> max_3d = CALL_GET_INFO< - ext::oneapi::experimental::info::device::max_work_groups<3>, - DependentFalse>(); + id<3> max_3d = + get_info, + DependentFalse>(); return id<1>{max_3d[2]}; } @@ -1480,7 +1458,7 @@ class device_impl : public std::enable_shared_from_this { arch::intel_gpu_dg2_g12, arch::intel_gpu_bmg_g21, arch::intel_gpu_bmg_g31, arch::intel_gpu_lnl_m, arch::intel_gpu_arl_h, arch::intel_gpu_ptl_h, - arch::intel_gpu_ptl_u, + arch::intel_gpu_ptl_u, arch::intel_gpu_wcl, }; try { return std::any_of( @@ -1493,7 +1471,7 @@ class device_impl : public std::enable_shared_from_this { } } CASE(ext_oneapi_is_composite) { - auto components = CALL_GET_INFO< + auto components = get_info< sycl::ext::oneapi::experimental::info::device::component_devices>(); // Any device with ext_oneapi_is_composite aspect will have at least two // constituent component devices. 
@@ -1650,12 +1628,7 @@ class device_impl : public std::enable_shared_from_this { extOneapiArchitectureIs(ext::oneapi::experimental::architecture Arch) const { return Arch == -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - get_info -#else - get_info_abi_workaround -#endif - (); + get_info(); } bool extOneapiArchitectureIs( @@ -1666,12 +1639,7 @@ class device_impl : public std::enable_shared_from_this { get_category_max_architecture(Category); if (CategoryMinArch.has_value() && CategoryMaxArch.has_value()) { auto Arch = -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - get_info -#else - get_info_abi_workaround -#endif - (); + get_info(); return CategoryMinArch <= Arch && Arch <= CategoryMaxArch; } return false; @@ -1969,7 +1937,7 @@ class device_impl : public std::enable_shared_from_this { // sycl_ext_oneapi_device_architecture, the runtime exception is // omitted, and std::nullopt is returned. try { - return CALL_GET_INFO< + return get_info< ext::oneapi::experimental::info::device::architecture>(); } catch (sycl::exception &e) { if (e.code() != errc::runtime) @@ -2031,7 +1999,8 @@ class device_impl : public std::enable_shared_from_this { (architecture::intel_gpu_bmg_g31 == DeviceArch) || (architecture::intel_gpu_lnl_m == DeviceArch) || (architecture::intel_gpu_ptl_h == DeviceArch) || - (architecture::intel_gpu_ptl_u == DeviceArch)) { + (architecture::intel_gpu_ptl_u == DeviceArch) || + (architecture::intel_gpu_wcl == DeviceArch)) { std::vector pvc_combs = { {8, 0, 0, 0, 16, 32, matrix_type::uint8, matrix_type::uint8, matrix_type::sint32, matrix_type::sint32}, @@ -2357,66 +2326,6 @@ class devices_range : public iterator_range { } }; -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES -template -typename Param::return_type device_impl::get_info() const { - return get_info_abi_workaround(); -} - -#define EXPORT_GET_INFO(PARAM) \ - template <> \ - __SYCL_EXPORT PARAM::return_type device_impl::get_info() const; - -// clang-format off -EXPORT_GET_INFO(ext::intel::info::device::device_id) 
-EXPORT_GET_INFO(ext::intel::info::device::pci_address) -EXPORT_GET_INFO(ext::intel::info::device::gpu_eu_count) -EXPORT_GET_INFO(ext::intel::info::device::gpu_eu_simd_width) -EXPORT_GET_INFO(ext::intel::info::device::gpu_slices) -EXPORT_GET_INFO(ext::intel::info::device::gpu_subslices_per_slice) -EXPORT_GET_INFO(ext::intel::info::device::gpu_eu_count_per_subslice) -EXPORT_GET_INFO(ext::intel::info::device::gpu_hw_threads_per_eu) -EXPORT_GET_INFO(ext::intel::info::device::max_mem_bandwidth) -EXPORT_GET_INFO(ext::intel::info::device::uuid) -EXPORT_GET_INFO(ext::intel::info::device::free_memory) -EXPORT_GET_INFO(ext::intel::info::device::memory_clock_rate) -EXPORT_GET_INFO(ext::intel::info::device::memory_bus_width) -EXPORT_GET_INFO(ext::intel::info::device::max_compute_queue_indices) -EXPORT_GET_INFO(ext::intel::esimd::info::device::has_2d_block_io_support) -EXPORT_GET_INFO(ext::intel::info::device::current_clock_throttle_reasons) -EXPORT_GET_INFO(ext::intel::info::device::fan_speed) -EXPORT_GET_INFO(ext::intel::info::device::min_power_limit) -EXPORT_GET_INFO(ext::intel::info::device::max_power_limit) - -EXPORT_GET_INFO(ext::codeplay::experimental::info::device::supports_fusion) -EXPORT_GET_INFO(ext::codeplay::experimental::info::device::max_registers_per_work_group) - -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_global_work_groups) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_work_groups<1>) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_work_groups<2>) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_work_groups<3>) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::work_group_progress_capabilities) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::sub_group_progress_capabilities) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::sub_group_progress_capabilities) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::work_item_progress_capabilities) 
-EXPORT_GET_INFO(ext::oneapi::experimental::info::device::work_item_progress_capabilities) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::work_item_progress_capabilities) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::architecture) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::matrix_combinations) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::image_row_pitch_align) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_image_linear_row_pitch) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_image_linear_width) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::max_image_linear_height) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::mipmap_max_anisotropy) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::component_devices) -EXPORT_GET_INFO(ext::oneapi::experimental::info::device::composite_device) -EXPORT_GET_INFO(ext::oneapi::info::device::num_compute_units) -// clang-format on - -#undef EXPORT_GET_INFO -#endif - -#undef CALL_GET_INFO } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/kernel_data.cpp b/sycl/source/detail/kernel_data.cpp index 2fa9a964ef36f..c01e58064e495 100644 --- a/sycl/source/detail/kernel_data.cpp +++ b/sycl/source/detail/kernel_data.cpp @@ -114,6 +114,10 @@ void KernelData::processArg(void *Ptr, const detail::kernel_param_kind_t &Kind, switch (Kind) { case kernel_param_kind_t::kind_std_layout: + case kernel_param_kind_t::kind_struct_with_special_type: { + addArg(Kind, Ptr, Size, Index + IndexShift); + break; + } case kernel_param_kind_t::kind_pointer: { addArg(Kind, Ptr, Size, Index + IndexShift); break; @@ -346,6 +350,10 @@ void KernelData::extractArgsAndReqsFromLambda() { } } +void KernelData::incrementArgShift(int Shift) { MArgShift += Shift; } + +int KernelData::getArgShift() const { return MArgShift; } + } // namespace detail } // namespace _V1 } // namespace sycl diff --git 
a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index 5380e76efebbd..d37f1fadcdb06 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -66,7 +66,10 @@ class KernelData { MArgs.emplace_back(std::forward(args)...); } - void clearArgs() { MArgs.clear(); } + void clearArgs() { + MArgs.clear(); + MArgShift = 0; + } detail::NDRDescT &getNDRDesc() & { return MNDRDesc; } @@ -290,6 +293,10 @@ class KernelData { void extractArgsAndReqsFromLambda(); + void incrementArgShift(int Shift); + + int getArgShift() const; + private: // Storage for any SYCL Graph dynamic parameters which have been flagged for // registration in the CG, along with the argument index for the parameter. @@ -313,6 +320,14 @@ class KernelData { // A pointer to device kernel information. Cached on the application side in // headers or retrieved from program manager. DeviceKernelInfo *MDeviceKernelInfoPtr = nullptr; + + // Certain arguments such as structs that contain SYCL special types entail + // several hidden set_arg calls for every set_arg called by the user. This + // shift is required to make sure the following arguments set by the user have + // the correct index. It keeps track of how many of these hidden set_arg calls + // have been made so far. The user cannot possibly know this, hence we need to + // keep track of this information. 
+ int MArgShift = 0; }; } // namespace detail diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 835e31952d5d0..8abbe0fdc261f 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -402,12 +402,11 @@ static void appendCompileOptionsFromImage(std::string &CompileOpts, auto ColonPos = OptValue.find(":"); auto Device = OptValue.substr(0, ColonPos); std::string BackendStrToAdd; - bool IsPVC = - std::all_of(Devs.begin(), Devs.end(), [&](device_impl &Dev) { - return IsIntelGPU && - (Dev.get_info() & - 0xFF00) == 0x0B00; - }); + bool IsPVC = std::all_of(Devs.begin(), Devs.end(), [&](device_impl &Dev) { + return IsIntelGPU && + (Dev.get_info() & 0xFF00) == + 0x0B00; + }); // Currently 'pvc' is the only supported device. if (Device == "pvc" && IsPVC) BackendStrToAdd = " " + OptValue.substr(ColonPos + 1) + " "; @@ -2434,6 +2433,20 @@ std::vector ProgramManager::getDeviceGlobalEntries( return FoundEntries; } +std::vector +ProgramManager::getProfileCounterDeviceGlobalEntries( + const context_impl *CtxImpl) { + std::vector ProfileCounters = + ProgramManager::getInstance().m_DeviceGlobals.getProfileCounterEntries(); + const auto NewEnd = + std::remove_if(ProfileCounters.begin(), ProfileCounters.end(), + [CtxImpl](DeviceGlobalMapEntry *DGEntry) { + return !DGEntry->isAvailableInContext(CtxImpl); + }); + ProfileCounters.erase(NewEnd, ProfileCounters.end()); + return ProfileCounters; +} + void ProgramManager::addOrInitHostPipeEntry(const void *HostPipePtr, const char *UniqueId) { std::lock_guard HostPipesGuard(m_HostPipesMutex); diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 98707d5e30d94..c7a1a0aafb854 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -304,6 
+304,11 @@ class ProgramManager { std::vector getDeviceGlobalEntries(const std::vector &UniqueIds, bool ExcludeDeviceImageScopeDecorated = false); + + // The function gets all device_global entries that are profile counters. + std::vector + getProfileCounterDeviceGlobalEntries(const context_impl *CtxImpl); + // The function inserts or initializes a host_pipe entry into the // host_pipe map. void addOrInitHostPipeEntry(const void *HostPipePtr, const char *UniqueId); diff --git a/sycl/source/detail/reduction.cpp b/sycl/source/detail/reduction.cpp index 84a8722c96e76..48ad823e06205 100644 --- a/sycl/source/detail/reduction.cpp +++ b/sycl/source/detail/reduction.cpp @@ -49,10 +49,10 @@ __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, return WGSize; } -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES -// Inline this helper: -#endif -uint32_t reduGetMaxNumConcurrentWorkGroups(device_impl &Dev) { +// Returns the estimated number of physical threads on the device associated +// with the given queue. +__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh) { + const device_impl &Dev = getSyclObjImpl(cgh)->get_device(); uint32_t NumThreads = Dev.get_info(); // TODO: The heuristics here require additional tuning for various devices // and vendors. Also, it would be better to check vendor/generation/etc. @@ -60,33 +60,10 @@ uint32_t reduGetMaxNumConcurrentWorkGroups(device_impl &Dev) { NumThreads *= 8; return NumThreads; } -// Returns the estimated number of physical threads on the device associated -// with the given queue. -__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh) { - return reduGetMaxNumConcurrentWorkGroups(getSyclObjImpl(cgh)->get_device()); -} -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES -__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups( - std::shared_ptr Queue) { - // TODO: Graphs extension explicit API uses a handler with no queue attached, - // so return some value here. 
In the future we should have access to the - // device so can remove this. - // - // The 8 value was chosen as the hardcoded value as it is the returned - // value for sycl::info::device::max_compute_units on - // Intel HD Graphics devices used as a L0 backend during development. - if (Queue == nullptr) { - return 8; - } - return reduGetMaxNumConcurrentWorkGroups(Queue->getDeviceImpl()); -} -#endif - -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES -// Inline this helper: -#endif -size_t reduGetMaxWGSize(device_impl &Dev, size_t LocalMemBytesPerWorkItem) { +__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh, + size_t LocalMemBytesPerWorkItem) { + const device_impl &Dev = getSyclObjImpl(cgh)->get_device(); size_t MaxWGSize = Dev.get_info(); size_t WGSizePerMem = MaxWGSize * 2; @@ -123,24 +100,9 @@ size_t reduGetMaxWGSize(device_impl &Dev, size_t LocalMemBytesPerWorkItem) { return WGSize; } -__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh, - size_t LocalMemBytesPerWorkItem) { - return reduGetMaxWGSize(getSyclObjImpl(cgh)->get_device(), - LocalMemBytesPerWorkItem); -} -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES -__SYCL_EXPORT -size_t reduGetMaxWGSize(std::shared_ptr Queue, - size_t LocalMemBytesPerWorkItem) { - return reduGetMaxWGSize(Queue->getDeviceImpl(), LocalMemBytesPerWorkItem); -} -#endif -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES -// Inline this helper: -#endif -size_t reduGetPreferredWGSize(device_impl &Dev, - size_t LocalMemBytesPerWorkItem) { +__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh, + size_t LocalMemBytesPerWorkItem) { // The maximum WGSize returned by CPU devices is very large and does not // help the reduction implementation: since all work associated with a // work-group is typically assigned to one CPU thread, selecting a large @@ -150,6 +112,7 @@ size_t reduGetPreferredWGSize(device_impl &Dev, // behavior. 
using PrefWGConfig = sycl::detail::SYCLConfig< sycl::detail::SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE>; + const device_impl &Dev = getSyclObjImpl(cgh)->get_device(); if (Dev.is_cpu()) { size_t CPUMaxWGSize = PrefWGConfig::get(sycl::info::device_type::cpu); if (CPUMaxWGSize == 0) @@ -177,46 +140,8 @@ size_t reduGetPreferredWGSize(device_impl &Dev, } // Use the maximum work-group size otherwise. - return reduGetMaxWGSize(Dev, LocalMemBytesPerWorkItem); -} -__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh, - size_t LocalMemBytesPerWorkItem) { - return reduGetPreferredWGSize(getSyclObjImpl(cgh)->get_device(), - LocalMemBytesPerWorkItem); -} -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES -__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, - size_t LocalMemBytesPerWorkItem) { - // TODO: Graphs extension explicit API uses a handler with a null queue to - // process CGFs, in future we should have access to the device so we can - // correctly calculate this. - // - // The 32 value was chosen as the hardcoded value as it is the returned - // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on - // Intel HD Graphics devices used as a L0 backend during development. 
- if (Queue == nullptr) { - return 32; - } - device_impl &Dev = Queue->getDeviceImpl(); - - return reduGetPreferredWGSize(Dev, LocalMemBytesPerWorkItem); -} -#endif - -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES -__SYCL_EXPORT void -addCounterInit(handler &CGH, std::shared_ptr &Queue, - std::shared_ptr &Counter) { - auto EventImpl = detail::event_impl::create_device_event(*Queue); - EventImpl->setContextImpl(Queue->getContextImpl()); - EventImpl->setStateIncomplete(); - ur_event_handle_t UREvent = nullptr; - MemoryManager::fill_usm(Counter.get(), *Queue, sizeof(int), {0}, {}, - &UREvent); - EventImpl->setHandle(UREvent); - CGH.depends_on(createSyclObjFromImpl(EventImpl)); + return reduGetMaxWGSize(cgh, LocalMemBytesPerWorkItem); } -#endif __SYCL_EXPORT void verifyReductionProps(const property_list &Props) { auto CheckDataLessProperties = [](int PropertyKind) { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index f36b1e269e009..2664a45e94e83 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2308,6 +2308,16 @@ static void GetUrArgsBasedOnType( 0, {}}; switch (Arg.MType) { + case kernel_param_kind_t::kind_struct_with_special_type: { + ur_exp_kernel_arg_type_t Type; + Type = UR_EXP_KERNEL_ARG_TYPE_VALUE; + ur_exp_kernel_arg_value_t Value = {}; + Value.value = {Arg.MPtr}; + UrArg.type = Type; + UrArg.size = static_cast(Arg.MSize); + UrArg.value = Value; + break; + } case kernel_param_kind_t::kind_dynamic_work_group_memory: break; case kernel_param_kind_t::kind_work_group_memory: @@ -2585,6 +2595,11 @@ static void SetArgBasedOnType( break; } + case kernel_param_kind_t::kind_struct_with_special_type: { + Adapter.call(Kernel, NextTrueIndex, + Arg.MSize, nullptr, Arg.MPtr); + break; + } case kernel_param_kind_t::kind_sampler: { sampler *SamplerPtr = (sampler *)Arg.MPtr; ur_sampler_handle_t Sampler = diff --git a/sycl/source/device.cpp b/sycl/source/device.cpp index 
3b8caf79ff72e..c4e29f53d99aa 100644 --- a/sycl/source/device.cpp +++ b/sycl/source/device.cpp @@ -127,13 +127,8 @@ detail::ABINeutralT_t::return_type> device::get_info_impl() const { static_assert( std::is_same_v::return_type, - decltype(impl->template -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - get_info -#else - get_info_abi_workaround -#endif - ())>); + decltype(impl->template get_info< + Param, true /* InitializingCache */>())>); return detail::convert_to_abi_neutral(impl->template get_info()); } diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 1cf9b00714471..1d291d7eb46d3 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1736,7 +1736,8 @@ void handler::addLifetimeSharedPtrStorage(std::shared_ptr SPtr) { void handler::addArg(detail::kernel_param_kind_t ArgKind, void *Req, int AccessTarget, int ArgIndex) { - impl->MKernelData.addArg(ArgKind, Req, AccessTarget, ArgIndex); + impl->MKernelData.addArg(ArgKind, Req, AccessTarget, + ArgIndex + impl->MKernelData.getArgShift()); } void handler::setArgsToAssociatedAccessors() { @@ -1812,6 +1813,10 @@ void handler::setDeviceKernelInfoPtr( impl->MKernelData.setDeviceKernelInfoPtr(DeviceKernelInfoPtr); } +void handler::incrementArgShift(int Shift) { + impl->MKernelData.incrementArgShift(Shift); +} + void handler::setKernelFunc(void *KernelFuncPtr) { impl->MKernelData.setKernelFunc(KernelFuncPtr); } diff --git a/sycl/source/kernel_bundle.cpp b/sycl/source/kernel_bundle.cpp index 14d19ddacaa6b..9639f0cb960d0 100644 --- a/sycl/source/kernel_bundle.cpp +++ b/sycl/source/kernel_bundle.cpp @@ -182,14 +182,7 @@ removeDuplicateDevices(const std::vector &Devs) { kernel_id get_kernel_id_impl(string_view KernelName) { return detail::ProgramManager::getInstance().getSYCLKernelID( -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES - std::string( -#endif - std::string_view(KernelName) -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES - ) -#endif - ); + std::string_view(KernelName)); } detail::KernelBundleImplPtr 
diff --git a/sycl/source/queue_v3.cpp b/sycl/source/queue_v3.cpp deleted file mode 100644 index 2196e94fffa45..0000000000000 --- a/sycl/source/queue_v3.cpp +++ /dev/null @@ -1,72 +0,0 @@ -//==-------------- queue_v3.cpp --------------------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// This file implements queue constructors for earlier releases and file -// queue.cpp implements queue constructors for the current release. This enables -// different default queue implementations for old and current user code, a -// feature needed on some platforms. This temporary and will be removed in the -// next release. - -#define __SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO_V3 -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace sycl { -inline namespace _V1 { - -queue::queue(const context &SyclContext, const device_selector &DeviceSelector, - const async_handler &AsyncHandler, const property_list &PropList) { - - const std::vector Devs = SyclContext.get_devices(); - - auto Comp = [&DeviceSelector](const device &d1, const device &d2) { - return DeviceSelector(d1) < DeviceSelector(d2); - }; - - const device &SyclDevice = *std::max_element(Devs.begin(), Devs.end(), Comp); - - impl = std::make_shared( - detail::getSyclObjImpl(SyclDevice), detail::getSyclObjImpl(SyclContext), - AsyncHandler, PropList, true); -} - -queue::queue(const context &SyclContext, const device &SyclDevice, - const async_handler &AsyncHandler, const property_list &PropList) { - impl = std::make_shared( - detail::getSyclObjImpl(SyclDevice), detail::getSyclObjImpl(SyclContext), - AsyncHandler, PropList, true); -} - -queue::queue(const device &SyclDevice, const async_handler &AsyncHandler, - 
const property_list &PropList) { - impl = std::make_shared( - detail::getSyclObjImpl(SyclDevice), AsyncHandler, PropList, true); -} - -queue::queue(const context &SyclContext, const device_selector &deviceSelector, - const property_list &PropList) - : queue(SyclContext, deviceSelector, - detail::getSyclObjImpl(SyclContext)->get_async_handler(), - PropList) {} - -queue::queue(const context &SyclContext, const device &SyclDevice, - const property_list &PropList) - : queue(SyclContext, SyclDevice, - detail::getSyclObjImpl(SyclContext)->get_async_handler(), - PropList) {} - -} // namespace _V1 -} // namespace sycl diff --git a/sycl/test-e2e/Basic/buffer/buffer.cpp b/sycl/test-e2e/Basic/buffer/buffer.cpp index e0676207b4efe..e40d8d5014a4f 100644 --- a/sycl/test-e2e/Basic/buffer/buffer.cpp +++ b/sycl/test-e2e/Basic/buffer/buffer.cpp @@ -515,9 +515,9 @@ int main() { std::vector bool_vector; std::vector int_vector; std::vector float_vector; - bool_vector.reserve(size); - int_vector.reserve(size); - float_vector.reserve(size); + bool_vector.resize(size); + int_vector.resize(size); + float_vector.resize(size); sycl::queue Queue; std::mutex m; diff --git a/sycl/test-e2e/Basic/built-ins.cpp b/sycl/test-e2e/Basic/built-ins.cpp index 5ec26787c676a..2984ba302a9d4 100644 --- a/sycl/test-e2e/Basic/built-ins.cpp +++ b/sycl/test-e2e/Basic/built-ins.cpp @@ -1,9 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out | FileCheck %s -// RUN: %{build} -D__SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ -Wno-#warnings -o %t_var.out -// RUN: %{run} %t_var.out | FileCheck %s - // Hits an assertion and kernel page fault with AMD: // UNSUPPORTED: target-amd // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/14404 @@ -28,15 +25,6 @@ static const CONSTANT char format[] = "Hello, World! 
%d %f\n"; int main() { s::queue q{}; -#ifdef __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ - if (!q.get_device().has(sycl::aspect::fp64)) { - std::cout << "Test with __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ defined is " - "skipped because the device did not have fp64." - << std::endl; - return 0; - } -#endif - // Test printf q.submit([&](s::handler &CGH) { CGH.single_task([=]() { diff --git a/sycl/test-e2e/CMakeLists.txt b/sycl/test-e2e/CMakeLists.txt index a430d8475f193..26da635642124 100644 --- a/sycl/test-e2e/CMakeLists.txt +++ b/sycl/test-e2e/CMakeLists.txt @@ -98,6 +98,9 @@ if(NOT SYCL_TEST_E2E_STANDALONE) sycl-toolchain FileCheck not + compiler-rt + llvm-profdata + llvm-cov ) endif() # Standalone. diff --git a/sycl/test-e2e/Coverage/device_code_coverage.cpp b/sycl/test-e2e/Coverage/device_code_coverage.cpp new file mode 100644 index 0000000000000..e6996e708f0d9 --- /dev/null +++ b/sycl/test-e2e/Coverage/device_code_coverage.cpp @@ -0,0 +1,64 @@ +// RUN: %{build} -fprofile-instr-generate -fcoverage-mapping -o %t.out +// RUN: %{run} LLVM_PROFILE_FILE=%t.profraw %t.out +// RUN: %{run-aux} llvm-profdata merge %t.profraw -o %t.profdata +// RUN: %{run-aux} llvm-cov show -instr-profile=%t.profdata %t.out -name="main" | FileCheck %s + +#include + +int main() { + sycl::queue q; + int *values = sycl::malloc_shared(10, q); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>(10), [=](sycl::id<1> idx) { + if (idx[0] < 8) + values[idx] = 42; + else + values[idx] = 7; + }); + }).wait(); + for (int i = 0; i < 10; i++) + assert(values[i] == (i < 8 ? 42 : 7)); + sycl::free(values, q); + return 0; +} + +// REQUIRES: target-spir +// UNSUPPORTED: opencl && gpu +// UNSUPPORTED-TRACKER: GSD-4287 +// UNSUPPORTED: windows +// UNSUPPORTED-INTENDED: On Windows, compiler-rt requires /MT but the flag +// cannot be used with SYCL. 
+ +// CHECK: main: +// CHECK: 8| 1|int main() { +// CHECK: 9| 1| sycl::queue q; +// CHECK: 10| 1| int *values = sycl::malloc_shared(10, q); +// CHECK: 11| 1| q.submit([&](sycl::handler &h) { +// CHECK: 12| 1| h.parallel_for(sycl::range<1>(10), [=](sycl::id<1> idx) { +// CHECK: 13| 1| if (idx[0] < 8) +// CHECK: 14| 1| values[idx] = 42; +// CHECK: 15| 1| else +// CHECK: 16| 1| values[idx] = 7; +// CHECK: 17| 1| }); +// CHECK: 18| 1| }).wait(); +// CHECK: 19| 11| for (int i = 0; i < 10; i++) +// CHECK: 20| 10| assert(values[i] == (i < 8 ? 42 : 7)); +// CHECK: 21| 1| sycl::free(values, q); +// CHECK: 22| 1| return 0; +// CHECK: 23| 1|} +// CHECK: device_code_coverage.cpp:_ZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_: +// CHECK: 11| 1| q.submit([&](sycl::handler &h) { +// CHECK: 12| 1| h.parallel_for(sycl::range<1>(10), [=](sycl::id<1> idx) { +// CHECK: 13| 1| if (idx[0] < 8) +// CHECK: 14| 1| values[idx] = 42; +// CHECK: 15| 1| else +// CHECK: 16| 1| values[idx] = 7; +// CHECK: 17| 1| }); +// CHECK: 18| 1| }).wait(); +// CHECK: device_code_coverage.cpp:_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_2idILi1EEEE_clES5_: +// CHECK: 12| 10| h.parallel_for(sycl::range<1>(10), [=](sycl::id<1> idx) { +// CHECK: 13| 10| if (idx[0] < 8) +// CHECK: 14| 8| values[idx] = 42; +// CHECK: 15| 2| else +// CHECK: 16| 2| values[idx] = 7; +// CHECK: 17| 10| }); diff --git a/sycl/test-e2e/DeviceLib/built-ins/printf.cpp b/sycl/test-e2e/DeviceLib/built-ins/printf.cpp index a0e7bff0d939e..e4608f7fd483e 100644 --- a/sycl/test-e2e/DeviceLib/built-ins/printf.cpp +++ b/sycl/test-e2e/DeviceLib/built-ins/printf.cpp @@ -4,9 +4,6 @@ // // RUN: %{build} -o %t.out // RUN: %{run} %t.out | FileCheck %s -// -// RUN: %{build} -fsycl-device-code-split=per_kernel -D__SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ -Wno-#warnings -o %t_var.out -// RUN: %{run} %t_var.out | FileCheck %s #include #include @@ -98,13 +95,6 @@ int main() { Queue.wait(); } -#ifdef __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ - // Currently printf 
will promote floating point values to doubles. - // __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ changes the behavior to use - // a variadic function, so if it is defined it will promote the floating - // point arguments. - if (Queue.get_device().has(sycl::aspect::fp64)) -#endif // __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ { Queue.submit([&](handler &CGH) { CGH.single_task([=]() { @@ -120,12 +110,6 @@ int main() { }); Queue.wait(); } -#ifdef __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ - else { - std::cout << "Skipped floating point test." << std::endl; - std::cout << "Skipped floating point test." << std::endl; - } -#endif // __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ // CHECK-NEXT: {{(33.4|Skipped floating point test.)}} // CHECK-NEXT: {{(-33.4|Skipped floating point test.)}} diff --git a/sycl/test-e2e/ESIMD/printf.cpp b/sycl/test-e2e/ESIMD/printf.cpp index 8f828b5e69f81..b15abe6128d26 100644 --- a/sycl/test-e2e/ESIMD/printf.cpp +++ b/sycl/test-e2e/ESIMD/printf.cpp @@ -10,9 +10,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out | FileCheck %s // -// RUN: %{build} -fsycl-device-code-split=per_kernel -D__SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ -Wno-#warnings -o %t_var.out -// RUN: %{run} %t_var.out | FileCheck %s -// //===----------------------------------------------------------------------===// // // The test checks that ESIMD kernels support printf functionality. @@ -67,13 +64,6 @@ int main() { Queue.wait(); } -#ifdef __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ - // Currently printf will promote floating point values to doubles. - // __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ changes the behavior to use - // a variadic function, so if it is defined it will promote the floating - // point arguments. 
- if (Queue.get_device().has(sycl::aspect::fp64)) -#endif // __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ { Queue.submit([&](handler &CGH) { CGH.single_task([=]() { @@ -89,12 +79,6 @@ int main() { }); Queue.wait(); } -#ifdef __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ - else { - std::cout << "Skipped floating point test." << std::endl; - std::cout << "Skipped floating point test." << std::endl; - } -#endif // __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ // CHECK-NEXT: {{(33.4|Skipped floating point test.)}} // CHECK-NEXT: {{(-33.4|Skipped floating point test.)}} diff --git a/sycl/test-e2e/FreeFunctionKernels/structs_with_special_types_as_kernel_paramters.cpp b/sycl/test-e2e/FreeFunctionKernels/structs_with_special_types_as_kernel_paramters.cpp index 72f4ca099fee3..81019f1e548c6 100644 --- a/sycl/test-e2e/FreeFunctionKernels/structs_with_special_types_as_kernel_paramters.cpp +++ b/sycl/test-e2e/FreeFunctionKernels/structs_with_special_types_as_kernel_paramters.cpp @@ -4,9 +4,6 @@ // This test verifies whether struct that contains either sycl::local_accesor or // sycl::accessor can be used with free function kernels extension. 
-// XFAIL: * -// XFAIL-TRACKER: CMPLRLLVM-67737 - #include #include #include diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp index e4ec248d16d43..d5c7deabc79e8 100644 --- a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp +++ b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp @@ -9,8 +9,6 @@ #include #include -#define __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ - using T = int; void add_nodes_to_graph( exp_ext::command_graph &Graph, diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_in_order_dependency.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_in_order_dependency.cpp index 0fb287f85fdbb..c86cdc4ab0670 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task_in_order_dependency.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task_in_order_dependency.cpp @@ -3,6 +3,9 @@ // Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG // RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// UNSUPPORTED: level_zero && windows && gpu-intel-gen12 +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/20696 +// // REQUIRES: aspect-usm_host_allocations // Tests injected barrier between an in-order operation in no event mode and a diff --git a/sycl/test-e2e/Printf/float.cpp b/sycl/test-e2e/Printf/float.cpp index 624a5977bc707..1fae377529cc4 100644 --- a/sycl/test-e2e/Printf/float.cpp +++ b/sycl/test-e2e/Printf/float.cpp @@ -8,9 +8,6 @@ // // RUN: %{build} -o %t.out // RUN: %{run} %t.out | FileCheck %s -// FIXME: Remove dedicated variadic printf testing once the option is removed. 
-// RUN: %{build} -o %t.nonvar.out -D__SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ -Wno-#warnings -// RUN: %{run} %t.nonvar.out | FileCheck %s // FIXME: Remove dedicated constant address space testing once generic AS // support is considered stable. // RUN: %{build} -o %t.constant.out -DTEST_CONSTANT_AS @@ -46,18 +43,6 @@ class FloatTest; int main() { queue q; - -#ifdef __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ - if (!q.get_device().has(aspect::fp64)) { - std::cout << "Skipping the actual test due to variadic argument promotion. " - "Printing hard-coded output from the host side:\n" - << "3.140000e+00, 3.140000E+00\n" - "0x1.91eb86p+1, 0X1.91EB86P+1\n" - "3.14, 3.14" - << std::endl; - return 0; - } -#endif // __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ q.submit([](handler &cgh) { cgh.single_task([]() { do_float_test(); }); }); diff --git a/sycl/test/abi/layout_compile_time_kernel_info.cpp b/sycl/test/abi/layout_compile_time_kernel_info.cpp index 348d268c49a9e..d3a2c802158da 100644 --- a/sycl/test/abi/layout_compile_time_kernel_info.cpp +++ b/sycl/test/abi/layout_compile_time_kernel_info.cpp @@ -11,16 +11,19 @@ void foo(sycl::detail::compile_time_kernel_info_v1::CompileTimeKernelInfoTy) {} // CHECK: 0 | struct sycl::detail::CompileTimeKernelInfoTy // CHECK: 0 | class sycl::detail::string_view Name // CHECK-NEXT: 0 | const char * str -// CHECK-NEXT: 8 | unsigned int NumParams -// CHECK-NEXT: 12 | _Bool IsESIMD -// CHECK-NEXT: 16 | class sycl::detail::string_view FileName -// CHECK-NEXT: 16 | const char * str -// CHECK-NEXT: 24 | class sycl::detail::string_view FunctionName +// CHECK-NEXT: 8 | size_t len +// CHECK-NEXT: 16 | unsigned int NumParams +// CHECK-NEXT: 20 | _Bool IsESIMD +// CHECK-NEXT: 24 | class sycl::detail::string_view FileName // CHECK-NEXT: 24 | const char * str -// CHECK-NEXT: 32 | unsigned int LineNumber -// CHECK-NEXT: 36 | unsigned int ColumnNumber -// CHECK-NEXT: 40 | int64_t KernelSize -// CHECK-NEXT: 48 | ParamDescGetterT ParamDescGetter -// CHECK-NEXT: 56 | 
_Bool HasSpecialCaptures -// CHECK-NEXT: | [sizeof=64, dsize=57, align=8, -// CHECK-NEXT: | nvsize=57, nvalign=8] +// CHECK-NEXT: 32 | size_t len +// CHECK-NEXT: 40 | class sycl::detail::string_view FunctionName +// CHECK-NEXT: 40 | const char * str +// CHECK-NEXT: 48 | size_t len +// CHECK-NEXT: 56 | unsigned int LineNumber +// CHECK-NEXT: 60 | unsigned int ColumnNumber +// CHECK-NEXT: 64 | int64_t KernelSize +// CHECK-NEXT: 72 | ParamDescGetterT ParamDescGetter +// CHECK-NEXT: 80 | _Bool HasSpecialCaptures +// CHECK-NEXT: | [sizeof=88, dsize=81, align=8, +// CHECK-NEXT: | nvsize=81, nvalign=8] diff --git a/sycl/test/abi/layout_handler.cpp b/sycl/test/abi/layout_handler.cpp index 8c749786fd67e..cfedce4277a31 100644 --- a/sycl/test/abi/layout_handler.cpp +++ b/sycl/test/abi/layout_handler.cpp @@ -38,35 +38,36 @@ void foo() { // CHECK-NEXT: 56 | pointer _M_end_of_storage // CHECK-NEXT: 64 | class sycl::detail::string_view MKernelName // CHECK-NEXT: 64 | const char * str -// CHECK-NEXT: 72 | class std::shared_ptr MKernel -// CHECK-NEXT: 72 | class std::__shared_ptr (base) -// CHECK-NEXT: 72 | class std::__shared_ptr_access (base) (empty) -// CHECK-NEXT: 72 | element_type * _M_ptr -// CHECK-NEXT: 80 | class std::__shared_count<> _M_refcount -// CHECK-NEXT: 80 | _Sp_counted_base<(enum __gnu_cxx::_Lock_policy)2U> * _M_pi -// CHECK-NEXT: 88 | void * MSrcPtr -// CHECK-NEXT: 96 | void * MDstPtr -// CHECK-NEXT: 104 | size_t MLength -// CHECK-NEXT: 112 | class std::vector MPattern -// CHECK-NEXT: 112 | struct std::_Vector_base > (base) -// CHECK-NEXT: 112 | struct std::_Vector_base >::_Vector_impl _M_impl -// CHECK-NEXT: 112 | class std::allocator (base) (empty) -// CHECK: 112 | pointer _M_start -// CHECK-NEXT: 120 | pointer _M_finish -// CHECK-NEXT: 128 | pointer _M_end_of_storage -// CHECK-NEXT: 136 | class std::unique_ptr MHostKernel -// CHECK-NEXT: 136 | struct std::__uniq_ptr_data > -// CHECK: 136 | class std::__uniq_ptr_impl > (base) -// CHECK-NEXT: 136 | class 
std::tuple > -// CHECK-NEXT: 136 | struct std::_Tuple_impl<0, class sycl::detail::HostKernelBase *, struct std::default_delete > (base) -// CHECK-NEXT: 136 | struct std::_Tuple_impl<1, struct std::default_delete > (base) (empty) -// CHECK: 136 | struct std::_Head_base<0, class sycl::detail::HostKernelBase *> (base) -// CHECK-NEXT: 136 | class sycl::detail::HostKernelBase * _M_head_impl -// CHECK-NEXT: 144 | struct sycl::detail::code_location MCodeLoc -// CHECK-NEXT: 144 | const char * MFileName -// CHECK-NEXT: 152 | const char * MFunctionName -// CHECK-NEXT: 160 | unsigned long MLineNo -// CHECK-NEXT: 168 | unsigned long MColumnNo +// CHECK-NEXT: 72 | size_t len +// CHECK-NEXT: 80 | class std::shared_ptr MKernel +// CHECK-NEXT: 80 | class std::__shared_ptr (base) +// CHECK-NEXT: 80 | class std::__shared_ptr_access (base) (empty) +// CHECK-NEXT: 80 | element_type * _M_ptr +// CHECK-NEXT: 88 | class std::__shared_count<> _M_refcount +// CHECK-NEXT: 88 | _Sp_counted_base<(enum __gnu_cxx::_Lock_policy)2U> * _M_pi +// CHECK-NEXT: 96 | void * MSrcPtr +// CHECK-NEXT: 104 | void * MDstPtr +// CHECK-NEXT: 112 | size_t MLength +// CHECK-NEXT: 120 | class std::vector MPattern +// CHECK-NEXT: 120 | struct std::_Vector_base > (base) +// CHECK-NEXT: 120 | struct std::_Vector_base >::_Vector_impl _M_impl +// CHECK-NEXT: 120 | class std::allocator (base) (empty) +// CHECK: 120 | pointer _M_start +// CHECK-NEXT: 128 | pointer _M_finish +// CHECK-NEXT: 136 | pointer _M_end_of_storage +// CHECK-NEXT: 144 | class std::unique_ptr MHostKernel +// CHECK-NEXT: 144 | struct std::__uniq_ptr_data > +// CHECK: 144 | class std::__uniq_ptr_impl > (base) +// CHECK-NEXT: 144 | class std::tuple > +// CHECK-NEXT: 144 | struct std::_Tuple_impl<0, class sycl::detail::HostKernelBase *, struct std::default_delete > (base) +// CHECK-NEXT: 144 | struct std::_Tuple_impl<1, struct std::default_delete > (base) (empty) +// CHECK: 144 | struct std::_Head_base<0, class sycl::detail::HostKernelBase *> (base) 
+// CHECK-NEXT: 144 | class sycl::detail::HostKernelBase * _M_head_impl +// CHECK-NEXT: 152 | struct sycl::detail::code_location MCodeLoc +// CHECK-NEXT: 152 | const char * MFileName +// CHECK-NEXT: 160 | const char * MFunctionName +// CHECK-NEXT: 168 | uint32_t MLineNo +// CHECK-NEXT: 172 | uint32_t MColumnNo // CHECK-NEXT: | [sizeof=176, dsize=176, align=8, // CHECK-NEXT: | nvsize=176, nvalign=8] // clang-format on \ No newline at end of file diff --git a/sycl/test/abi/layout_tls_code_loc_t.cpp b/sycl/test/abi/layout_tls_code_loc_t.cpp index ec1ffc18feb97..0c5dfd8f6ee8e 100644 --- a/sycl/test/abi/layout_tls_code_loc_t.cpp +++ b/sycl/test/abi/layout_tls_code_loc_t.cpp @@ -9,6 +9,7 @@ void foo(sycl::detail::tls_code_loc_t) {} // CHECK: 0 | class sycl::detail::tls_code_loc_t -// CHECK-NEXT: 0 | _Bool MLocalScope -// CHECK-NEXT: | [sizeof=1, dsize=1, align=1, -// CHECK-NEXT: | nvsize=1, nvalign=1] +// CHECK-NEXT: 0 | detail::code_location & CodeLocTLSRef +// CHECK-NEXT: 8 | _Bool MLocalScope +// CHECK-NEXT: | [sizeof=16, dsize=9, align=8, +// CHECK-NEXT: | nvsize=9, nvalign=8] diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index b953f825d5952..ac97996d5532b 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3273,7 +3273,6 @@ _ZN4sycl3_V16detail13lgamma_r_implEfPi _ZN4sycl3_V16detail13make_platformEmNS0_7backendE _ZN4sycl3_V16detail13select_deviceERKSt8functionIFiRKNS0_6deviceEEE _ZN4sycl3_V16detail13select_deviceERKSt8functionIFiRKNS0_6deviceEEERKNS0_7contextE -_ZN4sycl3_V16detail14addCounterInitERNS0_7handlerERSt10shared_ptrINS1_10queue_implEERS4_IiE _ZN4sycl3_V16detail14getBorderColorENS0_19image_channel_orderE _ZN4sycl3_V16detail14tls_code_loc_t5queryEv _ZN4sycl3_V16detail14tls_code_loc_tC1ERKNS1_13code_locationE @@ -3294,7 +3293,6 @@ _ZN4sycl3_V16detail16AccessorBaseHostC2ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6a 
_ZN4sycl3_V16detail16get_pointer_typeEPKvRNS1_12context_implE _ZN4sycl3_V16detail16openIPCMemHandleEPKSt4bytemRKNS0_7contextERKNS0_6deviceE _ZN4sycl3_V16detail16reduGetMaxWGSizeERNS0_7handlerEm -_ZN4sycl3_V16detail16reduGetMaxWGSizeESt10shared_ptrINS1_10queue_implEEm _ZN4sycl3_V16detail17HostProfilingInfo3endEv _ZN4sycl3_V16detail17HostProfilingInfo5startEv _ZN4sycl3_V16detail17device_global_map3addEPKvPKc @@ -3332,7 +3330,6 @@ _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6devi _ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateE _ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE _ZN4sycl3_V16detail22reduGetPreferredWGSizeERNS0_7handlerEm -_ZN4sycl3_V16detail22reduGetPreferredWGSizeERSt10shared_ptrINS1_10queue_implEEm _ZN4sycl3_V16detail22removeDuplicateDevicesERKSt6vectorINS0_6deviceESaIS3_EE _ZN4sycl3_V16detail23constructorNotificationEPvS2_NS0_6access6targetENS3_4modeERKNS1_13code_locationE _ZN4sycl3_V16detail24find_device_intersectionERKSt6vectorINS0_13kernel_bundleILNS0_12bundle_stateE1EEESaIS5_EE @@ -3351,7 +3348,6 @@ _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHostC1ENS0_5rangeILi3EEENS0_6acce _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHostC2ENS0_5rangeILi3EEENS0_6access4modeEPviiNS0_2idILi3EEENS0_18image_channel_typeENS0_19image_channel_orderERKNS0_13property_listE _ZN4sycl3_V16detail33enable_ext_oneapi_default_contextEb _ZN4sycl3_V16detail33reduGetMaxNumConcurrentWorkGroupsERNS0_7handlerE -_ZN4sycl3_V16detail33reduGetMaxNumConcurrentWorkGroupsESt10shared_ptrINS1_10queue_implEE _ZN4sycl3_V16detail34addHostSampledImageAccessorAndWaitEPNS1_28SampledImageAccessorImplHostE _ZN4sycl3_V16detail35sampledImageConstructorNotificationEPvS2_RKSt8optionalINS0_12image_targetEEPKvjRKNS1_13code_locationE 
_ZN4sycl3_V16detail36addHostUnsampledImageAccessorAndWaitEPNS1_30UnsampledImageAccessorImplHostE @@ -3547,6 +3543,7 @@ _ZN4sycl3_V17handler15ext_oneapi_copyEPKvRKNS0_3ext6oneapi12experimental16image_ _ZN4sycl3_V17handler15ext_oneapi_copyEPKvRKNS0_3ext6oneapi12experimental16image_descriptorEmPvS9_m _ZN4sycl3_V17handler16ext_oneapi_graphENS0_3ext6oneapi12experimental13command_graphILNS4_11graph_stateE1EEE _ZN4sycl3_V17handler16getMaxWorkGroupsEv +_ZN4sycl3_V17handler17incrementArgShiftEi _ZN4sycl3_V17handler17supportsUSMFill2DEv _ZN4sycl3_V17handler17use_kernel_bundleERKNS0_13kernel_bundleILNS0_12bundle_stateE2EEE _ZN4sycl3_V17handler18RangeRoundingTraceEv @@ -3708,47 +3705,6 @@ _ZNK4sycl3_V15queue9khr_emptyEv _ZNK4sycl3_V16ONEAPI15filter_selector13select_deviceEv _ZNK4sycl3_V16ONEAPI15filter_selector5resetEv _ZNK4sycl3_V16ONEAPI15filter_selectorclERKNS0_6deviceE -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device10gpu_slicesEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device11free_memoryEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device11pci_addressEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device12gpu_eu_countEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device15max_power_limitEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device15min_power_limitEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device16memory_bus_widthEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device17gpu_eu_simd_widthEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device17max_mem_bandwidthEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device17memory_clock_rateEEENT_11return_typeEv 
-_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device21gpu_hw_threads_per_euEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device23gpu_subslices_per_sliceEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device25gpu_eu_count_per_subsliceEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device25max_compute_queue_indicesEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device30current_clock_throttle_reasonsEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device4uuidEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device9device_idEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device9fan_speedEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel5esimd4info6device23has_2d_block_io_supportEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device12architectureEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device15max_work_groupsILi1EEEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device15max_work_groupsILi2EEEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device15max_work_groupsILi3EEEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device16composite_deviceEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device17component_devicesEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device19matrix_combinationsEEENT_11return_typeEv 
-_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device21image_row_pitch_alignEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device21mipmap_max_anisotropyEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device22max_global_work_groupsEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device22max_image_linear_widthEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device23max_image_linear_heightEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device26max_image_linear_row_pitchEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS6_15execution_scopeE2EEEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS6_15execution_scopeE3EEEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS6_15execution_scopeE1EEEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS6_15execution_scopeE2EEEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS6_15execution_scopeE3EEEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device32work_group_progress_capabilitiesILNS6_15execution_scopeE3EEEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi4info6device17num_compute_unitsEEENT_11return_typeEv 
-_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext8codeplay12experimental4info6device15supports_fusionEEENT_11return_typeEv -_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext8codeplay12experimental4info6device28max_registers_per_work_groupEEENT_11return_typeEv _ZNK4sycl3_V16detail11image_plain10getSamplerEv _ZNK4sycl3_V16detail11image_plain11getPropListEv _ZNK4sycl3_V16detail11image_plain11getRowPitchEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index cac88ba890cfd..a07fc7a6ccf89 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -26,16 +26,6 @@ ??$ext_oneapi_get_info@Unum_sub_groups@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBAIVqueue@12@AEBV?$range@$00@12@@Z ??$ext_oneapi_get_info@Unum_sub_groups@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBAIVqueue@12@AEBV?$range@$01@12@@Z ??$ext_oneapi_get_info@Unum_sub_groups@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBAIVqueue@12@AEBV?$range@$02@12@@Z -??$get_info@U?$max_work_groups@$00@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$id@$00@23@XZ -??$get_info@U?$max_work_groups@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$id@$01@23@XZ -??$get_info@U?$max_work_groups@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$id@$02@23@XZ -??$get_info@U?$sub_group_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ 
-??$get_info@U?$sub_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info@U?$work_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info@U?$work_item_progress_capabilities@$00@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info@U?$work_item_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info@U?$work_item_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info@Uarchitecture@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AW4architecture@experimental@oneapi@ext@23@XZ ??$get_info@Uatomic_fence_order_capabilities@context@info@_V1@sycl@@@context@_V1@sycl@@QEBA?AV?$vector@W4memory_order@_V1@sycl@@V?$allocator@W4memory_order@_V1@sycl@@@std@@@std@@XZ 
??$get_info@Uatomic_fence_scope_capabilities@context@info@_V1@sycl@@@context@_V1@sycl@@QEBA?AV?$vector@W4memory_scope@_V1@sycl@@V?$allocator@W4memory_scope@_V1@sycl@@@std@@@std@@XZ ??$get_info@Uatomic_memory_order_capabilities@context@info@_V1@sycl@@@context@_V1@sycl@@QEBA?AV?$vector@W4memory_order@_V1@sycl@@V?$allocator@W4memory_order@_V1@sycl@@@std@@@std@@XZ @@ -44,43 +34,14 @@ ??$get_info@Ucompile_num_sub_groups@kernel_device_specific@info@_V1@sycl@@@kernel@_V1@sycl@@QEBAIAEBVdevice@12@@Z ??$get_info@Ucompile_sub_group_size@kernel_device_specific@info@_V1@sycl@@@kernel@_V1@sycl@@QEBAIAEBVdevice@12@@Z ??$get_info@Ucompile_work_group_size@kernel_device_specific@info@_V1@sycl@@@kernel@_V1@sycl@@QEBA?AV?$range@$02@12@AEBVdevice@12@@Z -??$get_info@Ucomponent_devices@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@XZ -??$get_info@Ucomposite_device@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AVdevice@23@XZ ??$get_info@Ucontext@queue@info@_V1@sycl@@@queue@_V1@sycl@@QEBA?AVcontext@12@XZ -??$get_info@Ucurrent_clock_throttle_reasons@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4throttle_reason@intel@ext@_V1@sycl@@V?$allocator@W4throttle_reason@intel@ext@_V1@sycl@@@std@@@std@@XZ ??$get_info@Udevice@queue@info@_V1@sycl@@@queue@_V1@sycl@@QEBA?AVdevice@12@XZ -??$get_info@Udevice_id@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAIXZ ??$get_info@Udevices@context@info@_V1@sycl@@@context@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@XZ ??$get_info@Uext_codeplay_num_regs@kernel_device_specific@info@_V1@sycl@@@kernel@_V1@sycl@@QEBAIAEBVdevice@12@@Z -??$get_info@Ufan_speed@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAHXZ -??$get_info@Ufree_memory@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA_KXZ 
??$get_info@Uglobal_work_size@kernel_device_specific@info@_V1@sycl@@@kernel@_V1@sycl@@QEBA?AV?$range@$02@12@AEBVdevice@12@@Z -??$get_info@Ugpu_eu_count@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAIXZ -??$get_info@Ugpu_eu_count_per_subslice@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAIXZ -??$get_info@Ugpu_eu_simd_width@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAIXZ -??$get_info@Ugpu_hw_threads_per_eu@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAIXZ -??$get_info@Ugpu_slices@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAIXZ -??$get_info@Ugpu_subslices_per_slice@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAIXZ -??$get_info@Uhas_2d_block_io_support@device@info@esimd@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA_NXZ -??$get_info@Uimage_row_pitch_align@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAIXZ -??$get_info@Umatrix_combinations@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@Ucombination@matrix@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Ucombination@matrix@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info@Umax_compute_queue_indices@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAHXZ -??$get_info@Umax_global_work_groups@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA_KXZ -??$get_info@Umax_image_linear_height@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA_KXZ -??$get_info@Umax_image_linear_row_pitch@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA_KXZ -??$get_info@Umax_image_linear_width@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA_KXZ -??$get_info@Umax_mem_bandwidth@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA_KXZ 
??$get_info@Umax_num_sub_groups@kernel_device_specific@info@_V1@sycl@@@kernel@_V1@sycl@@QEBAIAEBVdevice@12@@Z -??$get_info@Umax_power_limit@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAHXZ -??$get_info@Umax_registers_per_work_group@device@info@experimental@codeplay@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAIXZ ??$get_info@Umax_sub_group_size@kernel_device_specific@info@_V1@sycl@@@kernel@_V1@sycl@@QEBAIAEBVdevice@12@@Z ??$get_info@Umax_sub_group_size@kernel_device_specific@info@_V1@sycl@@@kernel@_V1@sycl@@QEBAIAEBVdevice@12@AEBV?$range@$02@12@@Z -??$get_info@Umemory_bus_width@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAIXZ -??$get_info@Umemory_clock_rate@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAIXZ -??$get_info@Umin_power_limit@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAHXZ -??$get_info@Umipmap_max_anisotropy@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAMXZ -??$get_info@Unum_compute_units@device@info@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA_KXZ -??$get_info@Upci_address@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ ??$get_info@Uplatform@context@info@_V1@sycl@@@context@_V1@sycl@@QEBA?AVplatform@12@XZ ??$get_info@Upreferred_work_group_size_multiple@kernel_device_specific@info@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KAEBVdevice@12@@Z ??$get_info@Uprivate_mem_size@kernel_device_specific@info@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KAEBVdevice@12@@Z @@ -88,8 +49,6 @@ ??$get_info@Ureference_count@event@info@_V1@sycl@@@event@_V1@sycl@@QEBAIXZ ??$get_info@Ureference_count@queue@info@_V1@sycl@@@queue@_V1@sycl@@QEBAIXZ ??$get_info@Uspill_memory_size@kernel_device_specific@info@intel@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KAEBVdevice@12@@Z -??$get_info@Usupports_fusion@device@info@experimental@codeplay@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA_NXZ 
-??$get_info@Uuuid@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$array@E$0BA@@std@@XZ ??$get_info@Uwork_group_size@kernel_device_specific@info@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KAEBVdevice@12@@Z ??$get_info_impl@U?$max_work_groups@$00@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$id@$00@12@XZ ??$get_info_impl@U?$max_work_groups@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$id@$01@12@XZ @@ -627,7 +586,6 @@ ??4sampler@_V1@sycl@@QEAAAEAV012@AEBV012@@Z ??4stream@_V1@sycl@@QEAAAEAV012@$$QEAV012@@Z ??4stream@_V1@sycl@@QEAAAEAV012@AEBV012@@Z -??4tls_code_loc_t@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??8context@_V1@sycl@@QEBA_NAEBV012@@Z ??8device@_V1@sycl@@QEBA_NAEBV012@@Z ??8device_image_plain@detail@_V1@sycl@@QEBA_NAEBV0123@@Z @@ -3733,7 +3691,6 @@ ?add@host_pipe_map@detail@_V1@sycl@@YAXPEBXPEBD@Z ?add@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA?AVnode@34567@AEBVproperty_list@67@@Z ?addArg@handler@_V1@sycl@@AEAAXW4kernel_param_kind_t@detail@23@PEAXHH@Z -?addCounterInit@detail@_V1@sycl@@YAXAEAVhandler@23@AEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@AEAV?$shared_ptr@H@6@@Z ?addGraphLeafDependencies@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXVnode@34567@@Z ?addHostAccessorAndWait@detail@_V1@sycl@@YAXPEAVAccessorImplHost@123@@Z ?addHostSampledImageAccessorAndWait@detail@_V1@sycl@@YAXPEAVSampledImageAccessorImplHost@123@@Z @@ -3789,6 +3746,7 @@ ?cancel_fusion@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAAXXZ ?category@exception@_V1@sycl@@QEBAAEBVerror_category@std@@XZ ?checkNodePropertiesAndThrow@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@KAXAEBVproperty_list@67@@Z +?incrementArgShift@handler@_V1@sycl@@AEAAXH@Z ?close@ipc_memory@experimental@oneapi@ext@_V1@sycl@@YAXPEAXAEBVcontext@56@@Z ?code@exception@_V1@sycl@@QEBAAEBVerror_code@std@@XZ 
?compile_from_source@detail@experimental@oneapi@ext@_V1@sycl@@YA?AV?$kernel_bundle@$00@56@AEAV?$kernel_bundle@$02@56@AEBV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@Vstring_view@detail@_V1@sycl@@V?$allocator@Vstring_view@detail@_V1@sycl@@@std@@@std@@PEAVstring@156@2@Z @@ -4328,10 +4286,7 @@ ?query@tls_code_loc_t@detail@_V1@sycl@@QEAAAEBUcode_location@234@XZ ?reduComputeWGSize@detail@_V1@sycl@@YA_K_K0AEA_K@Z ?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIAEAVhandler@23@@Z -?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@@Z ?reduGetMaxWGSize@detail@_V1@sycl@@YA_KAEAVhandler@23@_K@Z -?reduGetMaxWGSize@detail@_V1@sycl@@YA_KV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z -?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z ?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAVhandler@23@_K@Z ?registerDynamicParameter@handler@_V1@sycl@@AEAAXPEAVdynamic_parameter_impl@detail@experimental@oneapi@ext@23@H@Z ?release_external_memory@experimental@oneapi@ext@_V1@sycl@@YAXUexternal_mem@12345@AEBVdevice@45@AEBVcontext@45@@Z diff --git a/sycl/test/abi/symbol_size_alignment.cpp b/sycl/test/abi/symbol_size_alignment.cpp index 5f0279580e657..2d66b5aba8692 100644 --- a/sycl/test/abi/symbol_size_alignment.cpp +++ b/sycl/test/abi/symbol_size_alignment.cpp @@ -52,11 +52,7 @@ int main() { check(); check(); check(); -#ifdef _MSC_VER - check(); -#else check(); -#endif check, 16, 8>(); check(); check(); diff --git a/sycl/test/basic_tests/logical_operations.cpp b/sycl/test/basic_tests/logical_operations.cpp index e040470cecd04..ccd052ba323ba 100644 --- a/sycl/test/basic_tests/logical_operations.cpp +++ b/sycl/test/basic_tests/logical_operations.cpp @@ -1,4 +1,3 @@ -// RUN: %clang -fpreview-breaking-changes -fsycl -fsyntax-only %s // RUN: %clang -fsycl -fsyntax-only %s #include @@ -10,16 +9,9 @@ int main() { const auto logicalOr = 
sycl::logical_or(); const auto logicalAndVoid = sycl::logical_and(); const auto logicalOrVoid = sycl::logical_or(); -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES static_assert(std::is_same_v); static_assert(std::is_same_v); static_assert(std::is_same_v); static_assert(std::is_same_v); -#else - static_assert(std::is_same_v); - static_assert(std::is_same_v); - static_assert(std::is_same_v); - static_assert(std::is_same_v); -#endif return 0; } diff --git a/sycl/test/extensions/experimental-printf.cpp b/sycl/test/extensions/experimental-printf.cpp deleted file mode 100644 index 3efc00bbca2c9..0000000000000 --- a/sycl/test/extensions/experimental-printf.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// This test is intended to check that internal -// __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ works as expected, i.e. we can -// see printf ExtInst regardless of the macro presence and that argument -// promotion is disabled if the macro is present. -// -// RUN: %clangxx -fsycl -fsycl-device-only -fno-sycl-use-bitcode %s -o %t.spv -// RUN: llvm-spirv -to-text %t.spv -o %t.spt -// RUN: FileCheck %s --check-prefixes CHECK,CHECK-FLOAT < %t.spt -// -// RUN: %clangxx -fsycl -fsycl-device-only -fno-sycl-use-bitcode -D__SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ %s -o %t.spv -// RUN: llvm-spirv -to-text %t.spv -o %t.spt -// RUN: FileCheck %s --check-prefixes CHECK,CHECK-DOUBLE < %t.spt - -// CHECK-FLOAT: TypeFloat [[#TYPE:]] 32 -// CHECK-DOUBLE: TypeFloat [[#TYPE:]] 64 -// CHECK: Constant [[#TYPE]] [[#CONST:]] -// CHECK: ExtInst [[#]] [[#]] [[#]] printf [[#]] [[#CONST]] - -#include - -#ifdef __SYCL_DEVICE_ONLY__ -#define __SYCL_CONSTANT_AS __attribute__((opencl_constant)) -#else -#define __SYCL_CONSTANT_AS -#endif - -const __SYCL_CONSTANT_AS char fmt[] = "Hello, World! 
%f\n"; - -int main() { - sycl::queue q; - - q.submit([&](sycl::handler &cgh) { - cgh.single_task([=]() { - float f = 3.14; - sycl::ext::oneapi::experimental::printf(fmt, f); - }); - }); - - return 0; -} diff --git a/sycl/test/group_algorithms/logical_or_and_group_algorithms.cpp b/sycl/test/group_algorithms/logical_or_and_group_algorithms.cpp index 5553efe364fbc..15ff3a7e5bd0b 100644 --- a/sycl/test/group_algorithms/logical_or_and_group_algorithms.cpp +++ b/sycl/test/group_algorithms/logical_or_and_group_algorithms.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -Xclang -verify=expected -Xclang -verify-ignore-unexpected=note -fpreview-breaking-changes -fsyntax-only -fsycl-device-only -ferror-limit=0 %s +// RUN: %clangxx -fsycl -Xclang -verify=expected -Xclang -verify-ignore-unexpected=note -fsyntax-only -fsycl-device-only -ferror-limit=0 %s #include diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index 2544bc16485d5..0fc0058caf6ab 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -147,6 +147,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp // CHECK-NEXT: ext/oneapi/interop_common.hpp // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp +// CHECK-NEXT: ext/oneapi/experimental/free_function_traits.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_handler.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_handler.hpp.cpp index 94b3cbf15d8d1..3beaefcc56711 100644 --- a/sycl/test/include_deps/sycl_khr_includes_handler.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_handler.hpp.cpp @@ -144,6 +144,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp // CHECK-NEXT: ext/oneapi/device_global/device_global.hpp // CHECK-NEXT: ext/oneapi/device_global/properties.hpp +// CHECK-NEXT: 
ext/oneapi/experimental/free_function_traits.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_kernel_bundle.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_kernel_bundle.hpp.cpp index a1ebb06b56591..698aa9a6115ee 100644 --- a/sycl/test/include_deps/sycl_khr_includes_kernel_bundle.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_kernel_bundle.hpp.cpp @@ -146,9 +146,9 @@ // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp // CHECK-NEXT: ext/oneapi/device_global/device_global.hpp // CHECK-NEXT: ext/oneapi/device_global/properties.hpp +// CHECK-NEXT: ext/oneapi/experimental/free_function_traits.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: sampler.hpp // CHECK-NEXT: sycl_span.hpp -// CHECK-NEXT: ext/oneapi/experimental/free_function_traits.hpp // CHECK-NEXT: specialization_id.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp index cf2ff1ec5bbac..f2ad156db54f0 100644 --- a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp @@ -151,6 +151,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp // CHECK-NEXT: ext/oneapi/interop_common.hpp // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp +// CHECK-NEXT: ext/oneapi/experimental/free_function_traits.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp index 53e1da9695d09..ecfbf51d76b0f 100644 --- a/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp @@ -174,6 +174,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp // 
CHECK-NEXT: ext/oneapi/device_global/device_global.hpp // CHECK-NEXT: ext/oneapi/device_global/properties.hpp +// CHECK-NEXT: ext/oneapi/experimental/free_function_traits.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_stream.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_stream.hpp.cpp index 7feda0438dee3..540be89282a9a 100644 --- a/sycl/test/include_deps/sycl_khr_includes_stream.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_stream.hpp.cpp @@ -163,6 +163,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp // CHECK-NEXT: ext/oneapi/device_global/device_global.hpp // CHECK-NEXT: ext/oneapi/device_global/properties.hpp +// CHECK-NEXT: ext/oneapi/experimental/free_function_traits.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp index 7b0becee030d4..0d78e36b146f5 100644 --- a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp @@ -166,6 +166,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp // CHECK-NEXT: ext/oneapi/interop_common.hpp // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp +// CHECK-NEXT: ext/oneapi/experimental/free_function_traits.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp diff --git a/sycl/test/warnings/variadic_ocl_printf.cpp b/sycl/test/warnings/variadic_ocl_printf.cpp deleted file mode 100644 index 74caff358136d..0000000000000 --- a/sycl/test/warnings/variadic_ocl_printf.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// RUN: %clangxx -D__SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ -fsycl -fsycl-device-only -fsyntax-only -Xclang -verify -Xclang -verify-ignore-unexpected=note %s - -// 
expected-warning@*:* {{__SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ is deprecated and will be removed in a future release.}} -#include - diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index 75cb616f48dd4..e0f18d53ffe35 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -497,8 +497,6 @@ typedef enum ur_function_t { UR_FUNCTION_QUEUE_BEGIN_CAPTURE_INTO_GRAPH_EXP = 298, /// Enumerator for ::urQueueEndGraphCaptureExp UR_FUNCTION_QUEUE_END_GRAPH_CAPTURE_EXP = 299, - /// Enumerator for ::urQueueAppendGraphExp - UR_FUNCTION_QUEUE_APPEND_GRAPH_EXP = 301, /// Enumerator for ::urGraphDestroyExp UR_FUNCTION_GRAPH_DESTROY_EXP = 302, /// Enumerator for ::urGraphExecutableGraphDestroyExp @@ -511,6 +509,8 @@ typedef enum ur_function_t { UR_FUNCTION_GRAPH_DUMP_CONTENTS_EXP = 306, /// Enumerator for ::urGraphInstantiateGraphExp UR_FUNCTION_GRAPH_INSTANTIATE_GRAPH_EXP = 307, + /// Enumerator for ::urEnqueueGraphExp + UR_FUNCTION_ENQUEUE_GRAPH_EXP = 308, /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -2489,7 +2489,7 @@ typedef enum ur_device_info_t { UR_DEVICE_INFO_CLOCK_DEVICE_SUPPORT_EXP = 0x2062, /// [::ur_bool_t] returns true if the device is integrated GPU. UR_DEVICE_INFO_IS_INTEGRATED_GPU = 0x2070, - /// [::ur_bool_t] returns true if the device supports graph record and replay + /// [::ur_bool_t] Returns true if the device supports graph record and replay /// functionality. 
UR_DEVICE_INFO_GRAPH_RECORD_AND_REPLAY_SUPPORT_EXP = 0x2080, /// [::ur_bool_t] Returns true if the device supports the USM P2P @@ -7409,6 +7409,8 @@ typedef enum ur_command_t { UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP = 0x2052, /// Event created by ::urEnqueueUSMFreeExp UR_COMMAND_ENQUEUE_USM_FREE_EXP = 0x2053, + /// Event created by ::urEnqueueGraphExp + UR_COMMAND_ENQUEUE_GRAPH_EXP = 0x2100, /// @cond UR_COMMAND_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -13515,20 +13517,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( #pragma region graph_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// -/// @brief Handle of record & replay graph object +/// @brief Handle of record & replay graph object. typedef struct ur_exp_graph_handle_t_ *ur_exp_graph_handle_t; /////////////////////////////////////////////////////////////////////////////// -/// @brief Handle of record & replay executable graph object +/// @brief Handle of record & replay executable graph object. typedef struct ur_exp_executable_graph_handle_t_ *ur_exp_executable_graph_handle_t; /////////////////////////////////////////////////////////////////////////////// /// @brief Create a new record & replay graph instance explicitly. /// -/// @details -/// - Create a new record & replay graph instance explicitly. -/// /// @returns /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_UNINITIALIZED @@ -13538,8 +13537,6 @@ typedef struct ur_exp_executable_graph_handle_t_ /// + `NULL == hContext` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT UR_APIEXPORT ur_result_t UR_APICALL urGraphCreateExp( /// [in] Handle of the context object. 
ur_context_handle_t hContext, @@ -13547,7 +13544,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urGraphCreateExp( ur_exp_graph_handle_t *phGraph); /////////////////////////////////////////////////////////////////////////////// -/// @brief Begin graph capture on the specified immediate queue. +/// @brief Begin graph capture on the specified queue. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -13556,15 +13553,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urGraphCreateExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hQueue` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT UR_APIEXPORT ur_result_t UR_APICALL urQueueBeginGraphCaptureExp( /// [in] Handle of the queue on which to begin graph capture. ur_queue_handle_t hQueue); /////////////////////////////////////////////////////////////////////////////// /// @brief Begin capturing commands into an existing graph on the specified -/// immediate queue. +/// queue. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -13574,8 +13569,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueBeginGraphCaptureExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hQueue` /// + `NULL == hGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT UR_APIEXPORT ur_result_t UR_APICALL urQueueBeginCaptureIntoGraphExp( /// [in] Handle of the queue on which to begin graph capture. ur_queue_handle_t hQueue, @@ -13583,7 +13576,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueBeginCaptureIntoGraphExp( ur_exp_graph_handle_t hGraph); /////////////////////////////////////////////////////////////////////////////// -/// @brief End graph capture on the specified immediate queue. +/// @brief End graph capture on the specified queue. 
/// /// @returns /// - ::UR_RESULT_SUCCESS @@ -13594,8 +13587,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueBeginCaptureIntoGraphExp( /// + `NULL == hQueue` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT UR_APIEXPORT ur_result_t UR_APICALL urQueueEndGraphCaptureExp( /// [in] Handle of the queue on which to end graph capture. ur_queue_handle_t hQueue, @@ -13616,8 +13607,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueEndGraphCaptureExp( /// + `NULL == hGraph` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phExecGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT UR_APIEXPORT ur_result_t UR_APICALL urGraphInstantiateGraphExp( /// [in] Handle of the recorded graph to instantiate. ur_exp_graph_handle_t hGraph, @@ -13625,7 +13614,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urGraphInstantiateGraphExp( ur_exp_executable_graph_handle_t *phExecGraph); /////////////////////////////////////////////////////////////////////////////// -/// @brief Append an executable graph to the queue. +/// @brief Enqueue an executable graph onto the queue. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -13635,20 +13624,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urGraphInstantiateGraphExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hQueue` /// + `NULL == hGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT -UR_APIEXPORT ur_result_t UR_APICALL urQueueAppendGraphExp( - /// [in] Handle of the queue to append the graph to. +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueGraphExp( + /// [in] Handle of the queue to which the graph will be enqueued. 
ur_queue_handle_t hQueue, - /// [in] Handle of the executable graph to append. + /// [in] Handle of the executable graph to be enqueued. ur_exp_executable_graph_handle_t hGraph, - /// [in][optional] Event to be signaled on completion. - ur_event_handle_t hSignalEvent, /// [in][optional] Number of events to wait on before executing. - uint32_t numWaitEvents, - /// [in][optional][range(0, numWaitEvents)] Handle of the events to wait - /// on before launching. - ur_event_handle_t *phWaitEvents); + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] Pointer to a list of + /// events that must be complete before this command can be executed. + /// If nullptr, the numEventsInWaitList must be 0, indicating that this + /// command does not wait on any event to complete. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] Event object that identifies this particular + /// command instance. + /// If phEventWaitList and phEvent are not nullptr, phEvent must not refer + /// to an element of the phEventWaitList array. + ur_event_handle_t *phEvent); /////////////////////////////////////////////////////////////////////////////// /// @brief Destroy a recorded graph object. All executable graph instances @@ -13662,8 +13658,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueAppendGraphExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT UR_APIEXPORT ur_result_t UR_APICALL urGraphDestroyExp( /// [in] Handle of the graph object to destroy. 
ur_exp_graph_handle_t hGraph); @@ -13679,8 +13673,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urGraphDestroyExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hExecutableGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT UR_APIEXPORT ur_result_t UR_APICALL urGraphExecutableGraphDestroyExp( /// [in] Handle of the executable graph object to destroy. ur_exp_executable_graph_handle_t hExecutableGraph); @@ -13696,14 +13688,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urGraphExecutableGraphDestroyExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hQueue` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == hResult` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT +/// + `NULL == pResult` UR_APIEXPORT ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( /// [in] Native queue to query. ur_queue_handle_t hQueue, /// [out] Pointer to a boolean where the result will be stored. - bool *hResult); + bool *pResult); /////////////////////////////////////////////////////////////////////////////// /// @brief Return whether the given recorded graph contains any nodes. @@ -13716,14 +13706,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hGraph` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == hResult` -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// + `NULL == pResult` +/// - ::UR_RESULT_ERROR_INVALID_GRAPH UR_APIEXPORT ur_result_t UR_APICALL urGraphIsEmptyExp( /// [in] Handle of the graph to query. ur_exp_graph_handle_t hGraph, /// [out] Pointer to a boolean where the result will be stored. - bool *hResult); + bool *pResult); /////////////////////////////////////////////////////////////////////////////// /// @brief Dump the contents of the recorded graph to the provided file path. 
@@ -13737,9 +13726,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urGraphIsEmptyExp( /// + `NULL == hGraph` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == filePath` -/// - ::UR_RESULT_ERROR_INVALID_VALUE -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL urGraphDumpContentsExp( /// [in] Handle of the graph to dump. ur_exp_graph_handle_t hGraph, @@ -14591,25 +14577,13 @@ typedef struct ur_queue_end_graph_capture_exp_params_t { ur_exp_graph_handle_t **pphGraph; } ur_queue_end_graph_capture_exp_params_t; -/////////////////////////////////////////////////////////////////////////////// -/// @brief Function parameters for urQueueAppendGraphExp -/// @details Each entry is a pointer to the parameter passed to the function; -/// allowing the callback the ability to modify the parameter's value -typedef struct ur_queue_append_graph_exp_params_t { - ur_queue_handle_t *phQueue; - ur_exp_executable_graph_handle_t *phGraph; - ur_event_handle_t *phSignalEvent; - uint32_t *pnumWaitEvents; - ur_event_handle_t **pphWaitEvents; -} ur_queue_append_graph_exp_params_t; - /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urQueueIsGraphCaptureEnabledExp /// @details Each entry is a pointer to the parameter passed to the function; /// allowing the callback the ability to modify the parameter's value typedef struct ur_queue_is_graph_capture_enabled_exp_params_t { ur_queue_handle_t *phQueue; - bool **phResult; + bool **ppResult; } ur_queue_is_graph_capture_enabled_exp_params_t; /////////////////////////////////////////////////////////////////////////////// @@ -15354,6 +15328,18 @@ typedef struct ur_enqueue_native_command_exp_params_t { ur_event_handle_t **pphEvent; } ur_enqueue_native_command_exp_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urEnqueueGraphExp +/// 
@details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_enqueue_graph_exp_params_t { + ur_queue_handle_t *phQueue; + ur_exp_executable_graph_handle_t *phGraph; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; + ur_event_handle_t **pphEvent; +} ur_enqueue_graph_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urUSMHostAlloc /// @details Each entry is a pointer to the parameter passed to the function; @@ -16256,7 +16242,7 @@ typedef struct ur_graph_executable_graph_destroy_exp_params_t { /// allowing the callback the ability to modify the parameter's value typedef struct ur_graph_is_empty_exp_params_t { ur_exp_graph_handle_t *phGraph; - bool **phResult; + bool **ppResult; } ur_graph_is_empty_exp_params_t; /////////////////////////////////////////////////////////////////////////////// diff --git a/unified-runtime/include/ur_api_funcs.def b/unified-runtime/include/ur_api_funcs.def index 9deb1baf0b193..fba1cec526697 100644 --- a/unified-runtime/include/ur_api_funcs.def +++ b/unified-runtime/include/ur_api_funcs.def @@ -91,7 +91,6 @@ _UR_API(urQueueFlush) _UR_API(urQueueBeginGraphCaptureExp) _UR_API(urQueueBeginCaptureIntoGraphExp) _UR_API(urQueueEndGraphCaptureExp) -_UR_API(urQueueAppendGraphExp) _UR_API(urQueueIsGraphCaptureEnabledExp) _UR_API(urSamplerCreate) _UR_API(urSamplerRetain) @@ -147,6 +146,7 @@ _UR_API(urEnqueueUSMFreeExp) _UR_API(urEnqueueCommandBufferExp) _UR_API(urEnqueueTimestampRecordingExp) _UR_API(urEnqueueNativeCommandExp) +_UR_API(urEnqueueGraphExp) _UR_API(urUSMHostAlloc) _UR_API(urUSMDeviceAlloc) _UR_API(urUSMSharedAlloc) diff --git a/unified-runtime/include/ur_ddi.h b/unified-runtime/include/ur_ddi.h index 28f7d75d7b523..3e473faa1f7b2 100644 --- a/unified-runtime/include/ur_ddi.h +++ b/unified-runtime/include/ur_ddi.h @@ -698,12 
+698,6 @@ typedef ur_result_t(UR_APICALL *ur_pfnQueueBeginCaptureIntoGraphExp_t)( typedef ur_result_t(UR_APICALL *ur_pfnQueueEndGraphCaptureExp_t)( ur_queue_handle_t, ur_exp_graph_handle_t *); -/////////////////////////////////////////////////////////////////////////////// -/// @brief Function-pointer for urQueueAppendGraphExp -typedef ur_result_t(UR_APICALL *ur_pfnQueueAppendGraphExp_t)( - ur_queue_handle_t, ur_exp_executable_graph_handle_t, ur_event_handle_t, - uint32_t, ur_event_handle_t *); - /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urQueueIsGraphCaptureEnabledExp typedef ur_result_t(UR_APICALL *ur_pfnQueueIsGraphCaptureEnabledExp_t)( @@ -715,7 +709,6 @@ typedef struct ur_queue_exp_dditable_t { ur_pfnQueueBeginGraphCaptureExp_t pfnBeginGraphCaptureExp; ur_pfnQueueBeginCaptureIntoGraphExp_t pfnBeginCaptureIntoGraphExp; ur_pfnQueueEndGraphCaptureExp_t pfnEndGraphCaptureExp; - ur_pfnQueueAppendGraphExp_t pfnAppendGraphExp; ur_pfnQueueIsGraphCaptureEnabledExp_t pfnIsGraphCaptureEnabledExp; } ur_queue_exp_dditable_t; @@ -1214,6 +1207,12 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueNativeCommandExp_t)( const ur_exp_enqueue_native_command_properties_t *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urEnqueueGraphExp +typedef ur_result_t(UR_APICALL *ur_pfnEnqueueGraphExp_t)( + ur_queue_handle_t, ur_exp_executable_graph_handle_t, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Table of EnqueueExp functions pointers typedef struct ur_enqueue_exp_dditable_t { @@ -1225,6 +1224,7 @@ typedef struct ur_enqueue_exp_dditable_t { ur_pfnEnqueueCommandBufferExp_t pfnCommandBufferExp; ur_pfnEnqueueTimestampRecordingExp_t pfnTimestampRecordingExp; ur_pfnEnqueueNativeCommandExp_t 
pfnNativeCommandExp; + ur_pfnEnqueueGraphExp_t pfnGraphExp; } ur_enqueue_exp_dditable_t; /////////////////////////////////////////////////////////////////////////////// diff --git a/unified-runtime/include/ur_print.h b/unified-runtime/include/ur_print.h index 251e0f9763a27..e0cbe0c89d488 100644 --- a/unified-runtime/include/ur_print.h +++ b/unified-runtime/include/ur_print.h @@ -2341,16 +2341,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintQueueEndGraphCaptureExpParams( const struct ur_queue_end_graph_capture_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_queue_append_graph_exp_params_t struct -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_SIZE -/// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL urPrintQueueAppendGraphExpParams( - const struct ur_queue_append_graph_exp_params_t *params, char *buffer, - const size_t buff_size, size_t *out_size); - /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_queue_is_graph_capture_enabled_exp_params_t struct /// @returns @@ -2905,6 +2895,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueNativeCommandExpParams( const struct ur_enqueue_native_command_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_enqueue_graph_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueGraphExpParams( + const struct ur_enqueue_graph_exp_params_t *params, char *buffer, + const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_usm_host_alloc_params_t struct /// @returns diff --git 
a/unified-runtime/include/ur_print.hpp b/unified-runtime/include/ur_print.hpp index a3afc20786add..27a7d4ea90ad2 100644 --- a/unified-runtime/include/ur_print.hpp +++ b/unified-runtime/include/ur_print.hpp @@ -1337,9 +1337,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_QUEUE_END_GRAPH_CAPTURE_EXP: os << "UR_FUNCTION_QUEUE_END_GRAPH_CAPTURE_EXP"; break; - case UR_FUNCTION_QUEUE_APPEND_GRAPH_EXP: - os << "UR_FUNCTION_QUEUE_APPEND_GRAPH_EXP"; - break; case UR_FUNCTION_GRAPH_DESTROY_EXP: os << "UR_FUNCTION_GRAPH_DESTROY_EXP"; break; @@ -1358,6 +1355,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_GRAPH_INSTANTIATE_GRAPH_EXP: os << "UR_FUNCTION_GRAPH_INSTANTIATE_GRAPH_EXP"; break; + case UR_FUNCTION_ENQUEUE_GRAPH_EXP: + os << "UR_FUNCTION_ENQUEUE_GRAPH_EXP"; + break; default: os << "unknown enumerator"; break; @@ -10799,6 +10799,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_command_t value) { case UR_COMMAND_ENQUEUE_USM_FREE_EXP: os << "UR_COMMAND_ENQUEUE_USM_FREE_EXP"; break; + case UR_COMMAND_ENQUEUE_GRAPH_EXP: + os << "UR_COMMAND_ENQUEUE_GRAPH_EXP"; + break; default: os << "unknown enumerator"; break; @@ -15306,52 +15309,6 @@ operator<<(std::ostream &os, return os; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print operator for the ur_queue_append_graph_exp_params_t type -/// @returns -/// std::ostream & -inline std::ostream &operator<<( - std::ostream &os, - [[maybe_unused]] const struct ur_queue_append_graph_exp_params_t *params) { - - os << ".hQueue = "; - - ur::details::printPtr(os, *(params->phQueue)); - - os << ", "; - os << ".hGraph = "; - - ur::details::printPtr(os, *(params->phGraph)); - - os << ", "; - os << ".hSignalEvent = "; - - ur::details::printPtr(os, *(params->phSignalEvent)); - - os << ", "; - os << ".numWaitEvents = "; - - os << *(params->pnumWaitEvents); - - os << ", "; - os << 
".phWaitEvents = "; - ur::details::printPtr( - os, reinterpret_cast(*(params->pphWaitEvents))); - if (*(params->pphWaitEvents) != NULL) { - os << " {"; - for (size_t i = 0; i < *params->pnumWaitEvents; ++i) { - if (i != 0) { - os << ", "; - } - - ur::details::printPtr(os, (*(params->pphWaitEvents))[i]); - } - os << "}"; - } - - return os; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_queue_is_graph_capture_enabled_exp_params_t /// type @@ -15367,9 +15324,9 @@ inline std::ostream &operator<<( ur::details::printPtr(os, *(params->phQueue)); os << ", "; - os << ".hResult = "; + os << ".pResult = "; - ur::details::printPtr(os, *(params->phResult)); + ur::details::printPtr(os, *(params->ppResult)); return os; } @@ -18141,6 +18098,52 @@ operator<<(std::ostream &os, return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_enqueue_graph_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<( + std::ostream &os, + [[maybe_unused]] const struct ur_enqueue_graph_exp_params_t *params) { + + os << ".hQueue = "; + + ur::details::printPtr(os, *(params->phQueue)); + + os << ", "; + os << ".hGraph = "; + + ur::details::printPtr(os, *(params->phGraph)); + + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = "; + ur::details::printPtr( + os, reinterpret_cast(*(params->pphEventWaitList))); + if (*(params->pphEventWaitList) != NULL) { + os << " {"; + for (size_t i = 0; i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, (*(params->pphEventWaitList))[i]); + } + os << "}"; + } + + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, *(params->pphEvent)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief 
Print operator for the ur_usm_host_alloc_params_t type /// @returns @@ -21097,9 +21100,9 @@ inline std::ostream &operator<<( ur::details::printPtr(os, *(params->phGraph)); os << ", "; - os << ".hResult = "; + os << ".pResult = "; - ur::details::printPtr(os, *(params->phResult)); + ur::details::printPtr(os, *(params->ppResult)); return os; } @@ -22337,9 +22340,6 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, case UR_FUNCTION_QUEUE_END_GRAPH_CAPTURE_EXP: { os << (const struct ur_queue_end_graph_capture_exp_params_t *)params; } break; - case UR_FUNCTION_QUEUE_APPEND_GRAPH_EXP: { - os << (const struct ur_queue_append_graph_exp_params_t *)params; - } break; case UR_FUNCTION_QUEUE_IS_GRAPH_CAPTURE_ENABLED_EXP: { os << (const struct ur_queue_is_graph_capture_enabled_exp_params_t *)params; } break; @@ -22511,6 +22511,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, case UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP: { os << (const struct ur_enqueue_native_command_exp_params_t *)params; } break; + case UR_FUNCTION_ENQUEUE_GRAPH_EXP: { + os << (const struct ur_enqueue_graph_exp_params_t *)params; + } break; case UR_FUNCTION_USM_HOST_ALLOC: { os << (const struct ur_usm_host_alloc_params_t *)params; } break; diff --git a/unified-runtime/scripts/core/EXP-GRAPH.rst b/unified-runtime/scripts/core/EXP-GRAPH.rst index 7b3dff39fe787..344242c30cfb8 100644 --- a/unified-runtime/scripts/core/EXP-GRAPH.rst +++ b/unified-runtime/scripts/core/EXP-GRAPH.rst @@ -48,8 +48,8 @@ Functions * ${x}QueueBeginGraphCaptureExp * ${x}QueueBeginCaptureIntoGraphExp * ${x}QueueEndGraphCaptureExp - * ${x}QueueAppendGraphExp * ${x}QueueIsGraphCaptureEnabledExp + * ${x}EnqueueGraphExp Changelog -------------------------------------------------------------------------------- @@ -62,6 +62,12 @@ Changelog | 1.1 | Extend ${x}_device_info_t enumerator with | | | graph record and replay entry. 
| +-----------+---------------------------------------------+ +| 1.2 | Extend ${x}_command_t enumerator with | +| | enqueue graph event entry. Cleanup spec | +| | entry descriptions and return values. | +| | Rename QueueAppendGraphExp into | +| | EnqueueGraphExp. | ++-----------+---------------------------------------------+ Support -------------------------------------------------------------------------------- diff --git a/unified-runtime/scripts/core/exp-graph.yml b/unified-runtime/scripts/core/exp-graph.yml index f2f6b9dc258e7..2c2306107abf7 100644 --- a/unified-runtime/scripts/core/exp-graph.yml +++ b/unified-runtime/scripts/core/exp-graph.yml @@ -7,21 +7,16 @@ # # See YaML.md for syntax definition # -# TODO: -# ZE_RESULT_ERROR_INVALID_GRAPH -# ZE_RESULT_QUERY_TRUE -# ZE_RESULT_QUERY_FALSE -# --- #-------------------------------------------------------------------------- type: header desc: "Intel $OneApi Unified Runtime Experimental APIs for Graph Record and Replay" --- #-------------------------------------------------------------------------- type: handle -desc: "Handle of record & replay graph object" +desc: "Handle of record & replay graph object." name: "$x_exp_graph_handle_t" --- #-------------------------------------------------------------------------- type: handle -desc: "Handle of record & replay executable graph object" +desc: "Handle of record & replay executable graph object." name: "$x_exp_executable_graph_handle_t" --- #-------------------------------------------------------------------------- type: enum @@ -33,16 +28,23 @@ etors: - name: GRAPH_RECORD_AND_REPLAY_SUPPORT_EXP value: "0x2080" desc: | - [$x_bool_t] returns true if the device supports graph record and replay + [$x_bool_t] Returns true if the device supports graph record and replay functionality. --- #-------------------------------------------------------------------------- +type: enum +extend: true +desc: "Command Type experimental enumerations." 
+name: $x_command_t +etors: + - name: ENQUEUE_GRAPH_EXP + value: "0x2100" + desc: "Event created by $xEnqueueGraphExp" +--- #-------------------------------------------------------------------------- type: function desc: "Create a new record & replay graph instance explicitly." class: $xGraph name: CreateExp decl: static -details: - - "Create a new record & replay graph instance explicitly." params: - type: $x_context_handle_t name: hContext @@ -50,91 +52,82 @@ params: - type: $x_exp_graph_handle_t* name: phGraph desc: "[out][alloc] Pointer to the handle of the created graph object." -returns: - - $X_RESULT_SUCCESS - - $X_RESULT_ERROR_INVALID_ARGUMENT --- #-------------------------------------------------------------------------- type: function -desc: "Begin graph capture on the specified immediate queue." +desc: "Begin graph capture on the specified queue." class: $xQueue name: BeginGraphCaptureExp params: - - type: $x_queue_handle_t - name: hQueue - desc: "[in] Handle of the queue on which to begin graph capture." -returns: - - $X_RESULT_SUCCESS - - $X_RESULT_ERROR_INVALID_ARGUMENT + - type: $x_queue_handle_t + name: hQueue + desc: "[in] Handle of the queue on which to begin graph capture." --- #-------------------------------------------------------------------------- type: function -desc: "Begin capturing commands into an existing graph on the specified immediate queue." +desc: "Begin capturing commands into an existing graph on the specified queue." class: $xQueue name: BeginCaptureIntoGraphExp params: - - type: $x_queue_handle_t - name: hQueue - desc: "[in] Handle of the queue on which to begin graph capture." - - type: $x_exp_graph_handle_t - name: hGraph - desc: "[in] Handle of the graph object to capture into." -returns: - - $X_RESULT_SUCCESS - - $X_RESULT_ERROR_INVALID_ARGUMENT + - type: $x_queue_handle_t + name: hQueue + desc: "[in] Handle of the queue on which to begin graph capture." 
+ - type: $x_exp_graph_handle_t + name: hGraph + desc: "[in] Handle of the graph object to capture into." --- #-------------------------------------------------------------------------- type: function -desc: "End graph capture on the specified immediate queue." +desc: "End graph capture on the specified queue." class: $xQueue name: EndGraphCaptureExp params: - - type: $x_queue_handle_t - name: hQueue - desc: "[in] Handle of the queue on which to end graph capture." - - type: $x_exp_graph_handle_t* - name: phGraph - desc: "[out] Pointer to the handle of the recorded graph object. If $xQueueBeginCaptureIntoGraphExp - was used to begin the capture, then phGraph will contain the same graph that was passed to it." -returns: - - $X_RESULT_SUCCESS - - $X_RESULT_ERROR_INVALID_ARGUMENT + - type: $x_queue_handle_t + name: hQueue + desc: "[in] Handle of the queue on which to end graph capture." + - type: $x_exp_graph_handle_t* + name: phGraph + desc: "[out] Pointer to the handle of the recorded graph object. If $xQueueBeginCaptureIntoGraphExp + was used to begin the capture, then phGraph will contain the same graph that was passed to it." --- #-------------------------------------------------------------------------- type: function desc: "Instantiate an executable graph from a recorded graph." class: $xGraph name: InstantiateGraphExp params: - - type: $x_exp_graph_handle_t - name: hGraph - desc: "[in] Handle of the recorded graph to instantiate." - - type: $x_exp_executable_graph_handle_t* - name: phExecGraph - desc: "[out] Pointer to the handle of the instantiated executable graph." -returns: - - $X_RESULT_SUCCESS - - $X_RESULT_ERROR_INVALID_ARGUMENT + - type: $x_exp_graph_handle_t + name: hGraph + desc: "[in] Handle of the recorded graph to instantiate." + - type: $x_exp_executable_graph_handle_t* + name: phExecGraph + desc: "[out] Pointer to the handle of the instantiated executable graph." 
--- #-------------------------------------------------------------------------- type: function -desc: "Append an executable graph to the queue." -class: $xQueue -name: AppendGraphExp +desc: "Enqueue an executable graph onto the queue." +class: $xEnqueue +name: GraphExp params: - - type: $x_queue_handle_t - name: hQueue - desc: "[in] Handle of the queue to append the graph to." - - type: $x_exp_executable_graph_handle_t - name: hGraph - desc: "[in] Handle of the executable graph to append." - - type: $x_event_handle_t - name: hSignalEvent - desc: "[in][optional] Event to be signaled on completion." - - type: uint32_t - name: numWaitEvents - desc: "[in][optional] Number of events to wait on before executing." - - type: $x_event_handle_t* - name: phWaitEvents - desc: "[in][optional][range(0, numWaitEvents)] Handle of the events to wait on before launching." + - type: $x_queue_handle_t + name: hQueue + desc: "[in] Handle of the queue to which the graph will be enqueued." + - type: $x_exp_executable_graph_handle_t + name: hGraph + desc: "[in] Handle of the executable graph to be enqueued." + - type: uint32_t + name: numEventsInWaitList + desc: "[in][optional] Number of events to wait on before executing." + - type: const $x_event_handle_t* + name: phEventWaitList + desc: | + [in][optional][range(0, numEventsInWaitList)] Pointer to a list of events that must be complete before this command can be executed. + If nullptr, the numEventsInWaitList must be 0, indicating that this command does not wait on any event to complete. + - type: $x_event_handle_t* + name: phEvent + desc: | + [out][optional][alloc] Event object that identifies this particular command instance. + If phEventWaitList and phEvent are not nullptr, phEvent must not refer to an element of the phEventWaitList array. 
returns: - - $X_RESULT_SUCCESS - - $X_RESULT_ERROR_INVALID_ARGUMENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." --- #-------------------------------------------------------------------------- type: function desc: "Destroy a recorded graph object. All executable graph instances created from this recorded graph must be destroyed before calling this function." @@ -144,9 +137,6 @@ params: - type: $x_exp_graph_handle_t name: hGraph desc: "[in] Handle of the graph object to destroy." -returns: - - $X_RESULT_SUCCESS - - $X_RESULT_ERROR_INVALID_ARGUMENT --- #-------------------------------------------------------------------------- type: function desc: "Destroy an instantiated executable graph object. The graph instance must not be executing on any queue." @@ -156,9 +146,6 @@ params: - type: $x_exp_executable_graph_handle_t name: hExecutableGraph desc: "[in] Handle of the executable graph object to destroy." -returns: - - $X_RESULT_SUCCESS - - $X_RESULT_ERROR_INVALID_ARGUMENT --- #-------------------------------------------------------------------------- type: function desc: "Query whether graph capture is currently enabled on the given queue." @@ -169,11 +156,8 @@ params: name: hQueue desc: "[in] Native queue to query." - type: bool* - name: hResult + name: pResult desc: "[out] Pointer to a boolean where the result will be stored." -returns: - - $X_RESULT_SUCCESS - - $X_RESULT_ERROR_INVALID_ARGUMENT --- #-------------------------------------------------------------------------- type: function desc: "Return whether the given recorded graph contains any nodes." @@ -184,11 +168,10 @@ params: name: hGraph desc: "[in] Handle of the graph to query." - type: bool* - name: hResult + name: pResult desc: "[out] Pointer to a boolean where the result will be stored." 
returns: - - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - - $X_RESULT_ERROR_OUT_OF_RESOURCES + - $X_RESULT_ERROR_INVALID_GRAPH --- #-------------------------------------------------------------------------- type: function desc: "Dump the contents of the recorded graph to the provided file path." @@ -201,7 +184,3 @@ params: - type: const char* name: filePath desc: "[in] Path to the file to write the dumped graph contents." -returns: - - $X_RESULT_ERROR_INVALID_VALUE - - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - - $X_RESULT_ERROR_OUT_OF_RESOURCES diff --git a/unified-runtime/scripts/core/registry.yml b/unified-runtime/scripts/core/registry.yml index 7fa6ab676fbc7..3146664d412ac 100644 --- a/unified-runtime/scripts/core/registry.yml +++ b/unified-runtime/scripts/core/registry.yml @@ -703,9 +703,6 @@ etors: - name: QUEUE_END_GRAPH_CAPTURE_EXP desc: Enumerator for $xQueueEndGraphCaptureExp value: '299' -- name: QUEUE_APPEND_GRAPH_EXP - desc: Enumerator for $xQueueAppendGraphExp - value: '301' - name: GRAPH_DESTROY_EXP desc: Enumerator for $xGraphDestroyExp value: '302' @@ -724,7 +721,10 @@ etors: - name: GRAPH_INSTANTIATE_GRAPH_EXP desc: Enumerator for $xGraphInstantiateGraphExp value: '307' -max_id: '307' +- name: ENQUEUE_GRAPH_EXP + desc: Enumerator for $xEnqueueGraphExp + value: '308' +max_id: '308' --- type: enum desc: Defines structure types diff --git a/unified-runtime/source/adapters/cuda/queue.cpp b/unified-runtime/source/adapters/cuda/queue.cpp index be3a71cc5299d..d9ad7eed159f4 100644 --- a/unified-runtime/source/adapters/cuda/queue.cpp +++ b/unified-runtime/source/adapters/cuda/queue.cpp @@ -275,11 +275,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueEndGraphCaptureExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urQueueAppendGraphExp( - ur_queue_handle_t /* hQueue */, - ur_exp_executable_graph_handle_t /* hGraph */, - ur_event_handle_t /* hSignalEvent */, uint32_t /* numWaitEvents */, - ur_event_handle_t * /* phWaitEvents */) { 
+UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueGraphExp(ur_queue_handle_t /* hQueue */, + ur_exp_executable_graph_handle_t /* hGraph */, + uint32_t /* numEventsInWaitList */, + const ur_event_handle_t * /* phEventWaitList */, + ur_event_handle_t * /* phEvent */) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp index 15e6ea7582dd5..88122acce13d2 100644 --- a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp @@ -259,7 +259,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetQueueExpProcAddrTable( pDdiTable->pfnBeginGraphCaptureExp = urQueueBeginGraphCaptureExp; pDdiTable->pfnBeginCaptureIntoGraphExp = urQueueBeginCaptureIntoGraphExp; pDdiTable->pfnEndGraphCaptureExp = urQueueEndGraphCaptureExp; - pDdiTable->pfnAppendGraphExp = urQueueAppendGraphExp; pDdiTable->pfnIsGraphCaptureEnabledExp = urQueueIsGraphCaptureEnabledExp; return UR_RESULT_SUCCESS; @@ -496,6 +495,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnUSMFreeExp = urEnqueueUSMFreeExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; + pDdiTable->pfnGraphExp = urEnqueueGraphExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/hip/queue.cpp b/unified-runtime/source/adapters/hip/queue.cpp index 3f9b5fee40adc..362e4b6d7f2bd 100644 --- a/unified-runtime/source/adapters/hip/queue.cpp +++ b/unified-runtime/source/adapters/hip/queue.cpp @@ -268,11 +268,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueEndGraphCaptureExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urQueueAppendGraphExp( - ur_queue_handle_t /* hQueue */, - ur_exp_executable_graph_handle_t /* hGraph */, - ur_event_handle_t /* hSignalEvent */, uint32_t /* 
numWaitEvents */, - ur_event_handle_t * /* phWaitEvents */) { +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueGraphExp(ur_queue_handle_t /* hQueue */, + ur_exp_executable_graph_handle_t /* hGraph */, + uint32_t /* numEventsInWaitList */, + const ur_event_handle_t * /* phEventWaitList */, + ur_event_handle_t * /* phEvent */) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp index ba9f3053b9320..5640836cce825 100644 --- a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp @@ -259,7 +259,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetQueueExpProcAddrTable( pDdiTable->pfnBeginGraphCaptureExp = urQueueBeginGraphCaptureExp; pDdiTable->pfnBeginCaptureIntoGraphExp = urQueueBeginCaptureIntoGraphExp; pDdiTable->pfnEndGraphCaptureExp = urQueueEndGraphCaptureExp; - pDdiTable->pfnAppendGraphExp = urQueueAppendGraphExp; pDdiTable->pfnIsGraphCaptureEnabledExp = urQueueIsGraphCaptureEnabledExp; return UR_RESULT_SUCCESS; @@ -489,6 +488,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; + pDdiTable->pfnGraphExp = urEnqueueGraphExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/level_zero/queue.cpp b/unified-runtime/source/adapters/level_zero/queue.cpp index d2cef4cc58523..6a18cd1a9f52e 100644 --- a/unified-runtime/source/adapters/level_zero/queue.cpp +++ b/unified-runtime/source/adapters/level_zero/queue.cpp @@ -954,11 +954,11 @@ ur_result_t urQueueEndGraphCaptureExp(ur_queue_handle_t /* hQueue */, return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t urQueueAppendGraphExp(ur_queue_handle_t /* hQueue */, - ur_exp_executable_graph_handle_t 
/* hGraph */, - ur_event_handle_t /* hSignalEvent */, - uint32_t /* numWaitEvents */, - ur_event_handle_t * /* phWaitEvents */) { +ur_result_t urEnqueueGraphExp(ur_queue_handle_t /* hQueue */, + ur_exp_executable_graph_handle_t /* hGraph */, + uint32_t /* numEventsInWaitList */, + const ur_event_handle_t * /* phEventWaitList */, + ur_event_handle_t * /* phEvent */) { UR_LOG_LEGACY(ERR, logger::LegacyMessage("[UR][L0] {} function not implemented!"), "{} function not implemented!", __FUNCTION__); diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp index 435e7cb80fa73..cdff8a4f4d3e8 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp @@ -235,6 +235,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = ur::level_zero::urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = ur::level_zero::urEnqueueNativeCommandExp; + pDdiTable->pfnGraphExp = ur::level_zero::urEnqueueGraphExp; return result; } @@ -474,7 +475,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetQueueExpProcAddrTable( pDdiTable->pfnBeginCaptureIntoGraphExp = ur::level_zero::urQueueBeginCaptureIntoGraphExp; pDdiTable->pfnEndGraphCaptureExp = ur::level_zero::urQueueEndGraphCaptureExp; - pDdiTable->pfnAppendGraphExp = ur::level_zero::urQueueAppendGraphExp; pDdiTable->pfnIsGraphCaptureEnabledExp = ur::level_zero::urQueueIsGraphCaptureEnabledExp; diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp index c53091fb05eb2..b65fed856f7e0 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp @@ -853,17 +853,17 @@ ur_result_t urQueueEndGraphCaptureExp(ur_queue_handle_t 
hQueue, ur_result_t urGraphInstantiateGraphExp(ur_exp_graph_handle_t hGraph, ur_exp_executable_graph_handle_t *phExecGraph); -ur_result_t urQueueAppendGraphExp(ur_queue_handle_t hQueue, - ur_exp_executable_graph_handle_t hGraph, - ur_event_handle_t hSignalEvent, - uint32_t numWaitEvents, - ur_event_handle_t *phWaitEvents); +ur_result_t urEnqueueGraphExp(ur_queue_handle_t hQueue, + ur_exp_executable_graph_handle_t hGraph, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); ur_result_t urGraphDestroyExp(ur_exp_graph_handle_t hGraph); ur_result_t urGraphExecutableGraphDestroyExp( ur_exp_executable_graph_handle_t hExecutableGraph); ur_result_t urQueueIsGraphCaptureEnabledExp(ur_queue_handle_t hQueue, - bool *hResult); -ur_result_t urGraphIsEmptyExp(ur_exp_graph_handle_t hGraph, bool *hResult); + bool *pResult); +ur_result_t urGraphIsEmptyExp(ur_exp_graph_handle_t hGraph, bool *pResult); ur_result_t urGraphDumpContentsExp(ur_exp_graph_handle_t hGraph, const char *filePath); #ifdef UR_STATIC_ADAPTER_LEVEL_ZERO diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp index 75bfe171e2367..6d7fe1cc99b5a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp @@ -496,19 +496,19 @@ ur_result_t urQueueEndGraphCaptureExp(ur_queue_handle_t hQueue, } catch (...) 
{ return exceptionToResult(std::current_exception()); } -ur_result_t urQueueAppendGraphExp(ur_queue_handle_t hQueue, - ur_exp_executable_graph_handle_t hGraph, - ur_event_handle_t hSignalEvent, - uint32_t numWaitEvents, - ur_event_handle_t *phWaitEvents) try { - return hQueue->get().queueAppendGraphExp(hGraph, hSignalEvent, numWaitEvents, - phWaitEvents); +ur_result_t urEnqueueGraphExp(ur_queue_handle_t hQueue, + ur_exp_executable_graph_handle_t hGraph, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) try { + return hQueue->get().enqueueGraphExp(hGraph, numEventsInWaitList, + phEventWaitList, phEvent); } catch (...) { return exceptionToResult(std::current_exception()); } ur_result_t urQueueIsGraphCaptureEnabledExp(ur_queue_handle_t hQueue, - bool *hResult) try { - return hQueue->get().queueIsGraphCapteEnabledExp(hResult); + bool *pResult) try { + return hQueue->get().queueIsGraphCapteEnabledExp(pResult); } catch (...) { return exceptionToResult(std::current_exception()); } diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp index 87b272cf9d413..06ffc5dbf526f 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp @@ -181,8 +181,8 @@ struct ur_queue_t_ : ur_queue_extensions { virtual ur_result_t queueBeginGraphCapteExp() = 0; virtual ur_result_t queueBeginCapteIntoGraphExp(ur_exp_graph_handle_t) = 0; virtual ur_result_t queueEndGraphCapteExp(ur_exp_graph_handle_t *) = 0; - virtual ur_result_t queueAppendGraphExp(ur_exp_executable_graph_handle_t, - ur_event_handle_t, uint32_t, - ur_event_handle_t *) = 0; + virtual ur_result_t enqueueGraphExp(ur_exp_executable_graph_handle_t, + uint32_t, const ur_event_handle_t *, + ur_event_handle_t *) = 0; virtual ur_result_t queueIsGraphCapteEnabledExp(bool *) = 0; }; diff --git 
a/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp index 86c4e8b740fae..882145cf22996 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp @@ -465,11 +465,10 @@ struct ur_queue_batched_t : ur_object, ur_queue_t_ { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - ur_result_t - queueAppendGraphExp(ur_exp_executable_graph_handle_t /* hGraph */, - ur_event_handle_t /* hSignalEvent */, - uint32_t /* numWaitEvents */, - ur_event_handle_t * /* phWaitEvents */) override { + ur_result_t enqueueGraphExp(ur_exp_executable_graph_handle_t /* hGraph */, + uint32_t /* numEventsInWaitList */, + const ur_event_handle_t * /* phEventWaitList */, + ur_event_handle_t * /* phEvent */) override { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index bdaf99d67ff82..6db4993e98b36 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -562,11 +562,10 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - ur_result_t - queueAppendGraphExp(ur_exp_executable_graph_handle_t /* hGraph */, - ur_event_handle_t /* hSignalEvent */, - uint32_t /* numWaitEvents */, - ur_event_handle_t * /* phWaitEvents */) override { + ur_result_t enqueueGraphExp(ur_exp_executable_graph_handle_t /* hGraph */, + uint32_t /* numEventsInWaitList */, + const ur_event_handle_t * /* phEventWaitList */, + ur_event_handle_t * /* phEvent */) override { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp 
b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp index e02f49361eaa8..d771ca8cce79a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp @@ -617,11 +617,10 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - ur_result_t - queueAppendGraphExp(ur_exp_executable_graph_handle_t /* hGraph */, - ur_event_handle_t /* hSignalEvent */, - uint32_t /* numWaitEvents */, - ur_event_handle_t * /* phWaitEvents */) override { + ur_result_t enqueueGraphExp(ur_exp_executable_graph_handle_t /* hGraph */, + uint32_t /* numEventsInWaitList */, + const ur_event_handle_t * /* phEventWaitList */, + ur_event_handle_t * /* phEvent */) override { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp b/unified-runtime/source/adapters/mock/ur_mockddi.cpp index 56a73ee8af011..85b7e143d15be 100644 --- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp +++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp @@ -12653,26 +12653,31 @@ __urdlllocal ur_result_t UR_APICALL urGraphInstantiateGraphExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urQueueAppendGraphExp -__urdlllocal ur_result_t UR_APICALL urQueueAppendGraphExp( - /// [in] Handle of the queue to append the graph to. +/// @brief Intercept function for urEnqueueGraphExp +__urdlllocal ur_result_t UR_APICALL urEnqueueGraphExp( + /// [in] Handle of the queue to which the graph will be enqueued. ur_queue_handle_t hQueue, - /// [in] Handle of the executable graph to append. + /// [in] Handle of the executable graph to be enqueued. ur_exp_executable_graph_handle_t hGraph, - /// [in][optional] Event to be signaled on completion. 
- ur_event_handle_t hSignalEvent, /// [in][optional] Number of events to wait on before executing. - uint32_t numWaitEvents, - /// [in][optional][range(0, numWaitEvents)] Handle of the events to wait - /// on before launching. - ur_event_handle_t *phWaitEvents) try { + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] Pointer to a list of + /// events that must be complete before this command can be executed. + /// If nullptr, the numEventsInWaitList must be 0, indicating that this + /// command does not wait on any event to complete. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] Event object that identifies this particular + /// command instance. + /// If phEventWaitList and phEvent are not nullptr, phEvent must not refer + /// to an element of the phEventWaitList array. + ur_event_handle_t *phEvent) try { ur_result_t result = UR_RESULT_SUCCESS; - ur_queue_append_graph_exp_params_t params = {&hQueue, &hGraph, &hSignalEvent, - &numWaitEvents, &phWaitEvents}; + ur_enqueue_graph_exp_params_t params = { + &hQueue, &hGraph, &numEventsInWaitList, &phEventWaitList, &phEvent}; auto beforeCallback = reinterpret_cast( - mock::getCallbacks().get_before_callback("urQueueAppendGraphExp")); + mock::getCallbacks().get_before_callback("urEnqueueGraphExp")); if (beforeCallback) { result = beforeCallback(&params); if (result != UR_RESULT_SUCCESS) { @@ -12681,11 +12686,15 @@ __urdlllocal ur_result_t UR_APICALL urQueueAppendGraphExp( } auto replaceCallback = reinterpret_cast( - mock::getCallbacks().get_replace_callback("urQueueAppendGraphExp")); + mock::getCallbacks().get_replace_callback("urEnqueueGraphExp")); if (replaceCallback) { result = replaceCallback(&params); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } result = UR_RESULT_SUCCESS; } @@ -12694,7 +12703,7 @@ __urdlllocal ur_result_t UR_APICALL urQueueAppendGraphExp( } auto afterCallback = reinterpret_cast( -
mock::getCallbacks().get_after_callback("urQueueAppendGraphExp")); + mock::getCallbacks().get_after_callback("urEnqueueGraphExp")); if (afterCallback) { return afterCallback(&params); } @@ -12797,10 +12806,10 @@ __urdlllocal ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( /// [in] Native queue to query. ur_queue_handle_t hQueue, /// [out] Pointer to a boolean where the result will be stored. - bool *hResult) try { + bool *pResult) try { ur_result_t result = UR_RESULT_SUCCESS; - ur_queue_is_graph_capture_enabled_exp_params_t params = {&hQueue, &hResult}; + ur_queue_is_graph_capture_enabled_exp_params_t params = {&hQueue, &pResult}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -12844,10 +12853,10 @@ __urdlllocal ur_result_t UR_APICALL urGraphIsEmptyExp( /// [in] Handle of the graph to query. ur_exp_graph_handle_t hGraph, /// [out] Pointer to a boolean where the result will be stored. - bool *hResult) try { + bool *pResult) try { ur_result_t result = UR_RESULT_SUCCESS; - ur_graph_is_empty_exp_params_t params = {&hGraph, &hResult}; + ur_graph_is_empty_exp_params_t params = {&hGraph, &pResult}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback("urGraphIsEmptyExp")); @@ -13303,6 +13312,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnNativeCommandExp = driver::urEnqueueNativeCommandExp; + pDdiTable->pfnGraphExp = driver::urEnqueueGraphExp; + return result; } catch (...)
{ return exceptionToResult(std::current_exception()); @@ -13803,8 +13814,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueExpProcAddrTable( pDdiTable->pfnEndGraphCaptureExp = driver::urQueueEndGraphCaptureExp; - pDdiTable->pfnAppendGraphExp = driver::urQueueAppendGraphExp; - pDdiTable->pfnIsGraphCaptureEnabledExp = driver::urQueueIsGraphCaptureEnabledExp; diff --git a/unified-runtime/source/adapters/native_cpu/queue.cpp b/unified-runtime/source/adapters/native_cpu/queue.cpp index a8999fab9da71..9b5bcc369f37a 100644 --- a/unified-runtime/source/adapters/native_cpu/queue.cpp +++ b/unified-runtime/source/adapters/native_cpu/queue.cpp @@ -113,11 +113,12 @@ UR_APIEXPORT ur_result_t urQueueEndGraphCaptureExp( DIE_NO_IMPLEMENTATION; } -UR_APIEXPORT ur_result_t urQueueAppendGraphExp( - ur_queue_handle_t /* hQueue */, - ur_exp_executable_graph_handle_t /* hGraph */, - ur_event_handle_t /* hSignalEvent */, uint32_t /* numWaitEvents */, - ur_event_handle_t * /* phWaitEvents */) { +UR_APIEXPORT ur_result_t +urEnqueueGraphExp(ur_queue_handle_t /* hQueue */, + ur_exp_executable_graph_handle_t /* hGraph */, + uint32_t /* numEventsInWaitList */, + const ur_event_handle_t * /* phEventWaitList */, + ur_event_handle_t * /* phEvent */) { DIE_NO_IMPLEMENTATION; } diff --git a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp index fc6e152be2922..139f80475e5dd 100644 --- a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp @@ -259,7 +259,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetQueueExpProcAddrTable( pDdiTable->pfnBeginGraphCaptureExp = urQueueBeginGraphCaptureExp; pDdiTable->pfnBeginCaptureIntoGraphExp = urQueueBeginCaptureIntoGraphExp; pDdiTable->pfnEndGraphCaptureExp = urQueueEndGraphCaptureExp; - pDdiTable->pfnAppendGraphExp = urQueueAppendGraphExp; pDdiTable->pfnIsGraphCaptureEnabledExp = 
urQueueIsGraphCaptureEnabledExp; return UR_RESULT_SUCCESS; @@ -473,6 +472,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; + pDdiTable->pfnGraphExp = urEnqueueGraphExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp index 0e15159c67e9c..2842b2d686601 100644 --- a/unified-runtime/source/adapters/offload/adapter.cpp +++ b/unified-runtime/source/adapters/offload/adapter.cpp @@ -56,7 +56,7 @@ ur_result_t ur_adapter_handle_t_::init() { ->Devices.push_back( std::make_unique(URPlatform->get(), D)); } - return false; + return true; }, &Adapter->Platforms); diff --git a/unified-runtime/source/adapters/offload/device.cpp b/unified-runtime/source/adapters/offload/device.cpp index 5ebe7170dcc48..0b0d5f4905cbe 100644 --- a/unified-runtime/source/adapters/offload/device.cpp +++ b/unified-runtime/source/adapters/offload/device.cpp @@ -456,6 +456,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( ImageTarget = UR_DEVICE_BINARY_TARGET_NVPTX64; } else if (Backend == OL_PLATFORM_BACKEND_AMDGPU) { ImageTarget = UR_DEVICE_BINARY_TARGET_AMDGCN; + } else if (Backend == OL_PLATFORM_BACKEND_LEVEL_ZERO) { + ImageTarget = UR_DEVICE_BINARY_TARGET_SPIRV64; } for (uint32_t i = 0; i < NumBinaries; ++i) { diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp index fa5246c8a7e58..68bca36bcc02c 100644 --- a/unified-runtime/source/adapters/offload/program.cpp +++ b/unified-runtime/source/adapters/offload/program.cpp @@ -174,7 +174,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t, UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(ur_program_handle_t hProgram, uint32_t, 
ur_device_handle_t *, - const char *pOptions) { + ur_exp_program_flags_t, const char *pOptions) { // Do nothing, program is built upon creation if (pOptions && *pOptions) { hProgram->Error = "Liboffload doesn't support link options"; diff --git a/unified-runtime/source/adapters/offload/queue.cpp b/unified-runtime/source/adapters/offload/queue.cpp index c5435929dbeb5..4eb9502b59353 100644 --- a/unified-runtime/source/adapters/offload/queue.cpp +++ b/unified-runtime/source/adapters/offload/queue.cpp @@ -140,11 +140,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueEndGraphCaptureExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urQueueAppendGraphExp( - ur_queue_handle_t /* hQueue */, - ur_exp_executable_graph_handle_t /* hGraph */, - ur_event_handle_t /* hSignalEvent */, uint32_t /* numWaitEvents */, - ur_event_handle_t * /* phWaitEvents */) { +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueGraphExp(ur_queue_handle_t /* hQueue */, + ur_exp_executable_graph_handle_t /* hGraph */, + uint32_t /* numEventsInWaitList */, + const ur_event_handle_t * /* phEventWaitList */, + ur_event_handle_t * /* phEvent */) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp index 0dd1ec836c7fb..eea537b87c610 100644 --- a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp @@ -222,7 +222,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetQueueExpProcAddrTable( pDdiTable->pfnBeginGraphCaptureExp = urQueueBeginGraphCaptureExp; pDdiTable->pfnBeginCaptureIntoGraphExp = urQueueBeginCaptureIntoGraphExp; pDdiTable->pfnEndGraphCaptureExp = urQueueEndGraphCaptureExp; - pDdiTable->pfnAppendGraphExp = urQueueAppendGraphExp; pDdiTable->pfnIsGraphCaptureEnabledExp = urQueueIsGraphCaptureEnabledExp; return UR_RESULT_SUCCESS; } @@ -425,6 +424,7 @@ 
UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = nullptr; pDdiTable->pfnNativeCommandExp = nullptr; pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; + pDdiTable->pfnGraphExp = urEnqueueGraphExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/opencl/queue.cpp b/unified-runtime/source/adapters/opencl/queue.cpp index eb511a7966229..7d52571a32bd1 100644 --- a/unified-runtime/source/adapters/opencl/queue.cpp +++ b/unified-runtime/source/adapters/opencl/queue.cpp @@ -327,11 +327,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueEndGraphCaptureExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urQueueAppendGraphExp( - ur_queue_handle_t /* hQueue */, - ur_exp_executable_graph_handle_t /* hGraph */, - ur_event_handle_t /* hSignalEvent */, uint32_t /* numWaitEvents */, - ur_event_handle_t * /* phWaitEvents */) { +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueGraphExp(ur_queue_handle_t /* hQueue */, + ur_exp_executable_graph_handle_t /* hGraph */, + uint32_t /* numEventsInWaitList */, + const ur_event_handle_t * /* phEventWaitList */, + ur_event_handle_t * /* phEvent */) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp index d6885f480a57c..ba2221074baa5 100644 --- a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp @@ -445,6 +445,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; + pDdiTable->pfnGraphExp = urEnqueueGraphExp; return UR_RESULT_SUCCESS; } @@ -489,7 +490,6 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urGetQueueExpProcAddrTable( pDdiTable->pfnBeginGraphCaptureExp = urQueueBeginGraphCaptureExp; pDdiTable->pfnBeginCaptureIntoGraphExp = urQueueBeginCaptureIntoGraphExp; pDdiTable->pfnEndGraphCaptureExp = urQueueEndGraphCaptureExp; - pDdiTable->pfnAppendGraphExp = urQueueAppendGraphExp; pDdiTable->pfnIsGraphCaptureEnabledExp = urQueueIsGraphCaptureEnabledExp; return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp index 2dc7e4ada1ace..905381c0a13d4 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp @@ -10723,43 +10723,48 @@ __urdlllocal ur_result_t UR_APICALL urGraphInstantiateGraphExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urQueueAppendGraphExp -__urdlllocal ur_result_t UR_APICALL urQueueAppendGraphExp( - /// [in] Handle of the queue to append the graph to. +/// @brief Intercept function for urEnqueueGraphExp +__urdlllocal ur_result_t UR_APICALL urEnqueueGraphExp( + /// [in] Handle of the queue to which the graph will be enqueued. ur_queue_handle_t hQueue, - /// [in] Handle of the executable graph to append. + /// [in] Handle of the executable graph to be enqueued. ur_exp_executable_graph_handle_t hGraph, - /// [in][optional] Event to be signaled on completion. - ur_event_handle_t hSignalEvent, /// [in][optional] Number of events to wait on before executing. - uint32_t numWaitEvents, - /// [in][optional][range(0, numWaitEvents)] Handle of the events to wait - /// on before launching. - ur_event_handle_t *phWaitEvents) { - auto pfnAppendGraphExp = getContext()->urDdiTable.QueueExp.pfnAppendGraphExp; + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] Pointer to a list of + /// events that must be complete before this command can be executed. 
+ /// If nullptr, the numEventsInWaitList must be 0, indicating that this + /// command does not wait on any event to complete. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] Event object that identifies this particular + /// command instance. + /// If phEventWaitList and phEvent are not nullptr, phEvent must not refer + /// to an element of the phEventWaitList array. + ur_event_handle_t *phEvent) { + auto pfnGraphExp = getContext()->urDdiTable.EnqueueExp.pfnGraphExp; - if (nullptr == pfnAppendGraphExp) + if (nullptr == pfnGraphExp) return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - ur_queue_append_graph_exp_params_t params = {&hQueue, &hGraph, &hSignalEvent, - &numWaitEvents, &phWaitEvents}; - uint64_t instance = getContext()->notify_begin( - UR_FUNCTION_QUEUE_APPEND_GRAPH_EXP, "urQueueAppendGraphExp", &params); + ur_enqueue_graph_exp_params_t params = { + &hQueue, &hGraph, &numEventsInWaitList, &phEventWaitList, &phEvent}; + uint64_t instance = getContext()->notify_begin(UR_FUNCTION_ENQUEUE_GRAPH_EXP, + "urEnqueueGraphExp", &params); auto &logger = getContext()->logger; - UR_LOG_L(logger, INFO, " ---> urQueueAppendGraphExp\n"); + UR_LOG_L(logger, INFO, " ---> urEnqueueGraphExp\n"); - ur_result_t result = pfnAppendGraphExp(hQueue, hGraph, hSignalEvent, - numWaitEvents, phWaitEvents); + ur_result_t result = pfnGraphExp(hQueue, hGraph, numEventsInWaitList, + phEventWaitList, phEvent); - getContext()->notify_end(UR_FUNCTION_QUEUE_APPEND_GRAPH_EXP, - "urQueueAppendGraphExp", &params, &result, instance); + getContext()->notify_end(UR_FUNCTION_ENQUEUE_GRAPH_EXP, "urEnqueueGraphExp", + &params, &result, instance); if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_QUEUE_APPEND_GRAPH_EXP, &params); - UR_LOG_L(logger, INFO, " <--- urQueueAppendGraphExp({}) -> {};\n", + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_GRAPH_EXP, + &params); + UR_LOG_L(logger, INFO, " <---
urEnqueueGraphExp({}) -> {};\n", args_str.str(), result); } @@ -10842,14 +10847,14 @@ __urdlllocal ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( /// [in] Native queue to query. ur_queue_handle_t hQueue, /// [out] Pointer to a boolean where the result will be stored. - bool *hResult) { + bool *pResult) { auto pfnIsGraphCaptureEnabledExp = getContext()->urDdiTable.QueueExp.pfnIsGraphCaptureEnabledExp; if (nullptr == pfnIsGraphCaptureEnabledExp) return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - ur_queue_is_graph_capture_enabled_exp_params_t params = {&hQueue, &hResult}; + ur_queue_is_graph_capture_enabled_exp_params_t params = {&hQueue, &pResult}; uint64_t instance = getContext()->notify_begin(UR_FUNCTION_QUEUE_IS_GRAPH_CAPTURE_ENABLED_EXP, "urQueueIsGraphCaptureEnabledExp", &params); @@ -10857,7 +10862,7 @@ __urdlllocal ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( auto &logger = getContext()->logger; UR_LOG_L(logger, INFO, " ---> urQueueIsGraphCaptureEnabledExp\n"); - ur_result_t result = pfnIsGraphCaptureEnabledExp(hQueue, hResult); + ur_result_t result = pfnIsGraphCaptureEnabledExp(hQueue, pResult); getContext()->notify_end(UR_FUNCTION_QUEUE_IS_GRAPH_CAPTURE_ENABLED_EXP, "urQueueIsGraphCaptureEnabledExp", &params, &result, @@ -10881,20 +10886,20 @@ __urdlllocal ur_result_t UR_APICALL urGraphIsEmptyExp( /// [in] Handle of the graph to query. ur_exp_graph_handle_t hGraph, /// [out] Pointer to a boolean where the result will be stored.
- bool *hResult) { + bool *pResult) { auto pfnIsEmptyExp = getContext()->urDdiTable.GraphExp.pfnIsEmptyExp; if (nullptr == pfnIsEmptyExp) return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - ur_graph_is_empty_exp_params_t params = {&hGraph, &hResult}; + ur_graph_is_empty_exp_params_t params = {&hGraph, &pResult}; uint64_t instance = getContext()->notify_begin(UR_FUNCTION_GRAPH_IS_EMPTY_EXP, "urGraphIsEmptyExp", &params); auto &logger = getContext()->logger; UR_LOG_L(logger, INFO, " ---> urGraphIsEmptyExp\n"); - ur_result_t result = pfnIsEmptyExp(hGraph, hResult); + ur_result_t result = pfnIsEmptyExp(hGraph, pResult); getContext()->notify_end(UR_FUNCTION_GRAPH_IS_EMPTY_EXP, "urGraphIsEmptyExp", &params, &result, instance); @@ -11459,6 +11464,9 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( dditable.pfnNativeCommandExp = pDdiTable->pfnNativeCommandExp; pDdiTable->pfnNativeCommandExp = ur_tracing_layer::urEnqueueNativeCommandExp; + dditable.pfnGraphExp = pDdiTable->pfnGraphExp; + pDdiTable->pfnGraphExp = ur_tracing_layer::urEnqueueGraphExp; + return result; } /////////////////////////////////////////////////////////////////////////////// @@ -12084,9 +12092,6 @@ __urdlllocal ur_result_t UR_APICALL urGetQueueExpProcAddrTable( pDdiTable->pfnEndGraphCaptureExp = ur_tracing_layer::urQueueEndGraphCaptureExp; - dditable.pfnAppendGraphExp = pDdiTable->pfnAppendGraphExp; - pDdiTable->pfnAppendGraphExp = ur_tracing_layer::urQueueAppendGraphExp; - dditable.pfnIsGraphCaptureEnabledExp = pDdiTable->pfnIsGraphCaptureEnabledExp; pDdiTable->pfnIsGraphCaptureEnabledExp = ur_tracing_layer::urQueueIsGraphCaptureEnabledExp; diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index b0245d25daef4..8a5cccca06159 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -11502,22 +11502,27 @@ __urdlllocal
ur_result_t UR_APICALL urGraphInstantiateGraphExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urQueueAppendGraphExp -__urdlllocal ur_result_t UR_APICALL urQueueAppendGraphExp( - /// [in] Handle of the queue to append the graph to. +/// @brief Intercept function for urEnqueueGraphExp +__urdlllocal ur_result_t UR_APICALL urEnqueueGraphExp( + /// [in] Handle of the queue to which the graph will be enqueued. ur_queue_handle_t hQueue, - /// [in] Handle of the executable graph to append. + /// [in] Handle of the executable graph to be enqueued. ur_exp_executable_graph_handle_t hGraph, - /// [in][optional] Event to be signaled on completion. - ur_event_handle_t hSignalEvent, /// [in][optional] Number of events to wait on before executing. - uint32_t numWaitEvents, - /// [in][optional][range(0, numWaitEvents)] Handle of the events to wait - /// on before launching. - ur_event_handle_t *phWaitEvents) { - auto pfnAppendGraphExp = getContext()->urDdiTable.QueueExp.pfnAppendGraphExp; + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] Pointer to a list of + /// events that must be complete before this command can be executed. + /// If nullptr, the numEventsInWaitList must be 0, indicating that this + /// command does not wait on any event to complete. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] Event object that identifies this particular + /// command instance. + /// If phEventWaitList and phEvent are not nullptr, phEvent must not refer + /// to an element of the phEventWaitList array. 
+ ur_event_handle_t *phEvent) { + auto pfnGraphExp = getContext()->urDdiTable.EnqueueExp.pfnGraphExp; - if (nullptr == pfnAppendGraphExp) { + if (nullptr == pfnGraphExp) { return UR_RESULT_ERROR_UNINITIALIZED; } @@ -11527,6 +11532,20 @@ __urdlllocal ur_result_t UR_APICALL urQueueAppendGraphExp( if (NULL == hGraph) return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (phEventWaitList == NULL && numEventsInWaitList > 0) + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + + if (phEventWaitList != NULL && numEventsInWaitList == 0) + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } } if (getContext()->enableLifetimeValidation && @@ -11534,13 +11553,13 @@ __urdlllocal ur_result_t UR_APICALL urQueueAppendGraphExp( URLOG_CTX_INVALID_REFERENCE(hQueue); } - if (getContext()->enableLifetimeValidation && - !getContext()->refCountContext->isReferenceValid(hSignalEvent)) { - URLOG_CTX_INVALID_REFERENCE(hSignalEvent); - } + ur_result_t result = pfnGraphExp(hQueue, hGraph, numEventsInWaitList, + phEventWaitList, phEvent); - ur_result_t result = pfnAppendGraphExp(hQueue, hGraph, hSignalEvent, - numWaitEvents, phWaitEvents); + if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS && + phEvent) { + getContext()->refCountContext->createRefCount(*phEvent); + } return result; } @@ -11594,7 +11613,7 @@ __urdlllocal ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( /// [in] Native queue to query. ur_queue_handle_t hQueue, /// [out] Pointer to a boolean where the result will be stored. 
- bool *hResult) { + bool *pResult) { auto pfnIsGraphCaptureEnabledExp = getContext()->urDdiTable.QueueExp.pfnIsGraphCaptureEnabledExp; @@ -11603,7 +11622,7 @@ __urdlllocal ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( } if (getContext()->enableParameterValidation) { - if (NULL == hResult) + if (NULL == pResult) return UR_RESULT_ERROR_INVALID_NULL_POINTER; if (NULL == hQueue) @@ -11615,7 +11634,7 @@ __urdlllocal ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( URLOG_CTX_INVALID_REFERENCE(hQueue); } - ur_result_t result = pfnIsGraphCaptureEnabledExp(hQueue, hResult); + ur_result_t result = pfnIsGraphCaptureEnabledExp(hQueue, pResult); return result; } @@ -11626,7 +11645,7 @@ __urdlllocal ur_result_t UR_APICALL urGraphIsEmptyExp( /// [in] Handle of the graph to query. ur_exp_graph_handle_t hGraph, /// [out] Pointer to a boolean where the result will be stored. - bool *hResult) { + bool *pResult) { auto pfnIsEmptyExp = getContext()->urDdiTable.GraphExp.pfnIsEmptyExp; if (nullptr == pfnIsEmptyExp) { @@ -11634,14 +11653,14 @@ __urdlllocal ur_result_t UR_APICALL urGraphIsEmptyExp( } if (getContext()->enableParameterValidation) { - if (NULL == hResult) + if (NULL == pResult) return UR_RESULT_ERROR_INVALID_NULL_POINTER; if (NULL == hGraph) return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - ur_result_t result = pfnIsEmptyExp(hGraph, hResult); + ur_result_t result = pfnIsEmptyExp(hGraph, pResult); return result; } @@ -12197,6 +12216,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnNativeCommandExp = ur_validation_layer::urEnqueueNativeCommandExp; + dditable.pfnGraphExp = pDdiTable->pfnGraphExp; + pDdiTable->pfnGraphExp = ur_validation_layer::urEnqueueGraphExp; + return result; } @@ -12838,9 +12860,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueExpProcAddrTable( pDdiTable->pfnEndGraphCaptureExp = ur_validation_layer::urQueueEndGraphCaptureExp; - dditable.pfnAppendGraphExp = pDdiTable->pfnAppendGraphExp; - 
pDdiTable->pfnAppendGraphExp = ur_validation_layer::urQueueAppendGraphExp; - dditable.pfnIsGraphCaptureEnabledExp = pDdiTable->pfnIsGraphCaptureEnabledExp; pDdiTable->pfnIsGraphCaptureEnabledExp = ur_validation_layer::urQueueIsGraphCaptureEnabledExp; diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in index e368adca367dd..0435ec9315e1d 100644 --- a/unified-runtime/source/loader/loader.def.in +++ b/unified-runtime/source/loader/loader.def.in @@ -76,6 +76,7 @@ EXPORTS urEnqueueEventsWait urEnqueueEventsWaitWithBarrier urEnqueueEventsWaitWithBarrierExt + urEnqueueGraphExp urEnqueueKernelLaunch urEnqueueKernelLaunchWithArgsExp urEnqueueMemBufferCopy @@ -304,6 +305,7 @@ EXPORTS urPrintEnqueueEventsWaitParams urPrintEnqueueEventsWaitWithBarrierExtParams urPrintEnqueueEventsWaitWithBarrierParams + urPrintEnqueueGraphExpParams urPrintEnqueueKernelLaunchParams urPrintEnqueueKernelLaunchWithArgsExpParams urPrintEnqueueMemBufferCopyParams @@ -501,7 +503,6 @@ EXPORTS urPrintProgramReleaseParams urPrintProgramRetainParams urPrintProgramSetSpecializationConstantsParams - urPrintQueueAppendGraphExpParams urPrintQueueBeginCaptureIntoGraphExpParams urPrintQueueBeginGraphCaptureExpParams urPrintQueueCreateParams @@ -603,7 +604,6 @@ EXPORTS urProgramRelease urProgramRetain urProgramSetSpecializationConstants - urQueueAppendGraphExp urQueueBeginCaptureIntoGraphExp urQueueBeginGraphCaptureExp urQueueCreate diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in index a9d015469ef6a..ac8dcf2522ea8 100644 --- a/unified-runtime/source/loader/loader.map.in +++ b/unified-runtime/source/loader/loader.map.in @@ -76,6 +76,7 @@ urEnqueueEventsWait; urEnqueueEventsWaitWithBarrier; urEnqueueEventsWaitWithBarrierExt; + urEnqueueGraphExp; urEnqueueKernelLaunch; urEnqueueKernelLaunchWithArgsExp; urEnqueueMemBufferCopy; @@ -304,6 +305,7 @@ urPrintEnqueueEventsWaitParams; 
urPrintEnqueueEventsWaitWithBarrierExtParams; urPrintEnqueueEventsWaitWithBarrierParams; + urPrintEnqueueGraphExpParams; urPrintEnqueueKernelLaunchParams; urPrintEnqueueKernelLaunchWithArgsExpParams; urPrintEnqueueMemBufferCopyParams; @@ -501,7 +503,6 @@ urPrintProgramReleaseParams; urPrintProgramRetainParams; urPrintProgramSetSpecializationConstantsParams; - urPrintQueueAppendGraphExpParams; urPrintQueueBeginCaptureIntoGraphExpParams; urPrintQueueBeginGraphCaptureExpParams; urPrintQueueCreateParams; @@ -603,7 +604,6 @@ urProgramRelease; urProgramRetain; urProgramSetSpecializationConstants; - urQueueAppendGraphExp; urQueueBeginCaptureIntoGraphExp; urQueueBeginGraphCaptureExp; urQueueCreate; diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp index c7c60ec879dc9..3a366b990055c 100644 --- a/unified-runtime/source/loader/ur_ldrddi.cpp +++ b/unified-runtime/source/loader/ur_ldrddi.cpp @@ -6085,29 +6085,34 @@ __urdlllocal ur_result_t UR_APICALL urGraphInstantiateGraphExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urQueueAppendGraphExp -__urdlllocal ur_result_t UR_APICALL urQueueAppendGraphExp( - /// [in] Handle of the queue to append the graph to. +/// @brief Intercept function for urEnqueueGraphExp +__urdlllocal ur_result_t UR_APICALL urEnqueueGraphExp( + /// [in] Handle of the queue to which the graph will be enqueued. ur_queue_handle_t hQueue, - /// [in] Handle of the executable graph to append. + /// [in] Handle of the executable graph to be enqueued. ur_exp_executable_graph_handle_t hGraph, - /// [in][optional] Event to be signaled on completion. - ur_event_handle_t hSignalEvent, /// [in][optional] Number of events to wait on before executing. - uint32_t numWaitEvents, - /// [in][optional][range(0, numWaitEvents)] Handle of the events to wait - /// on before launching. 
- ur_event_handle_t *phWaitEvents) { + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] Pointer to a list of + /// events that must be complete before this command can be executed. + /// If nullptr, the numEventsInWaitList must be 0, indicating that this + /// command does not wait on any event to complete. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] Event object that identifies this particular + /// command instance. + /// If phEventWaitList and phEvent are not nullptr, phEvent must not refer + /// to an element of the phEventWaitList array. + ur_event_handle_t *phEvent) { auto *dditable = *reinterpret_cast<ur_dditable_t **>(hQueue); - auto *pfnAppendGraphExp = dditable->QueueExp.pfnAppendGraphExp; - if (nullptr == pfnAppendGraphExp) + auto *pfnGraphExp = dditable->EnqueueExp.pfnGraphExp; + if (nullptr == pfnGraphExp) return UR_RESULT_ERROR_UNINITIALIZED; // forward to device-platform - return pfnAppendGraphExp(hQueue, hGraph, hSignalEvent, numWaitEvents, - phWaitEvents); + return pfnGraphExp(hQueue, hGraph, numEventsInWaitList, phEventWaitList, + phEvent); } /////////////////////////////////////////////////////////////////////////////// @@ -6149,7 +6154,7 @@ __urdlllocal ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( /// [in] Native queue to query. ur_queue_handle_t hQueue, /// [out] Pointer to a boolean where the result will be stored. - bool *hResult) { + bool *pResult) { auto *dditable = *reinterpret_cast<ur_dditable_t **>(hQueue); @@ -6159,7 +6164,7 @@ __urdlllocal ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( return UR_RESULT_ERROR_UNINITIALIZED; // forward to device-platform - return pfnIsGraphCaptureEnabledExp(hQueue, hResult); + return pfnIsGraphCaptureEnabledExp(hQueue, pResult); } /////////////////////////////////////////////////////////////////////////////// @@ -6168,7 +6173,7 @@ __urdlllocal ur_result_t UR_APICALL urGraphIsEmptyExp( /// [in] Handle of the graph to query.
ur_exp_graph_handle_t hGraph, /// [out] Pointer to a boolean where the result will be stored. - bool *hResult) { + bool *pResult) { auto *dditable = *reinterpret_cast<ur_dditable_t **>(hGraph); @@ -6177,7 +6182,7 @@ __urdlllocal ur_result_t UR_APICALL urGraphIsEmptyExp( return UR_RESULT_ERROR_UNINITIALIZED; // forward to device-platform - return pfnIsEmptyExp(hGraph, hResult); + return pfnIsEmptyExp(hGraph, pResult); } /////////////////////////////////////////////////////////////////////////////// @@ -6641,6 +6646,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = ur_loader::urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = ur_loader::urEnqueueNativeCommandExp; + pDdiTable->pfnGraphExp = ur_loader::urEnqueueGraphExp; } else { // return pointers directly to platform's DDIs *pDdiTable = @@ -7362,7 +7368,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueExpProcAddrTable( pDdiTable->pfnBeginCaptureIntoGraphExp = ur_loader::urQueueBeginCaptureIntoGraphExp; pDdiTable->pfnEndGraphCaptureExp = ur_loader::urQueueEndGraphCaptureExp; - pDdiTable->pfnAppendGraphExp = ur_loader::urQueueAppendGraphExp; pDdiTable->pfnIsGraphCaptureEnabledExp = ur_loader::urQueueIsGraphCaptureEnabledExp; } else { diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index 48366210af215..7e5d6977b619d 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -11018,9 +11018,6 @@ ur_result_t UR_APICALL urEnqueueNativeCommandExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Create a new record & replay graph instance explicitly. /// -/// @details -/// - Create a new record & replay graph instance explicitly.
-/// /// @returns /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_UNINITIALIZED @@ -11030,8 +11027,6 @@ ur_result_t UR_APICALL urEnqueueNativeCommandExp( /// + `NULL == hContext` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urGraphCreateExp( /// [in] Handle of the context object. ur_context_handle_t hContext, @@ -11047,7 +11042,7 @@ ur_result_t UR_APICALL urGraphCreateExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Begin graph capture on the specified immediate queue. +/// @brief Begin graph capture on the specified queue. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -11056,8 +11051,6 @@ ur_result_t UR_APICALL urGraphCreateExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hQueue` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urQueueBeginGraphCaptureExp( /// [in] Handle of the queue on which to begin graph capture. ur_queue_handle_t hQueue) try { @@ -11073,7 +11066,7 @@ ur_result_t UR_APICALL urQueueBeginGraphCaptureExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Begin capturing commands into an existing graph on the specified -/// immediate queue. +/// queue. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -11083,8 +11076,6 @@ ur_result_t UR_APICALL urQueueBeginGraphCaptureExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hQueue` /// + `NULL == hGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urQueueBeginCaptureIntoGraphExp( /// [in] Handle of the queue on which to begin graph capture. 
ur_queue_handle_t hQueue, @@ -11101,7 +11092,7 @@ ur_result_t UR_APICALL urQueueBeginCaptureIntoGraphExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief End graph capture on the specified immediate queue. +/// @brief End graph capture on the specified queue. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -11112,8 +11103,6 @@ ur_result_t UR_APICALL urQueueBeginCaptureIntoGraphExp( /// + `NULL == hQueue` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urQueueEndGraphCaptureExp( /// [in] Handle of the queue on which to end graph capture. ur_queue_handle_t hQueue, @@ -11143,8 +11132,6 @@ ur_result_t UR_APICALL urQueueEndGraphCaptureExp( /// + `NULL == hGraph` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phExecGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urGraphInstantiateGraphExp( /// [in] Handle of the recorded graph to instantiate. ur_exp_graph_handle_t hGraph, @@ -11161,7 +11148,7 @@ ur_result_t UR_APICALL urGraphInstantiateGraphExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Append an executable graph to the queue. +/// @brief Enqueue an executable graph onto the queue. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -11171,27 +11158,33 @@ ur_result_t UR_APICALL urGraphInstantiateGraphExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hQueue` /// + `NULL == hGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT -ur_result_t UR_APICALL urQueueAppendGraphExp( - /// [in] Handle of the queue to append the graph to. +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. 
+ur_result_t UR_APICALL urEnqueueGraphExp( + /// [in] Handle of the queue to which the graph will be enqueued. ur_queue_handle_t hQueue, - /// [in] Handle of the executable graph to append. + /// [in] Handle of the executable graph to be enqueued. ur_exp_executable_graph_handle_t hGraph, - /// [in][optional] Event to be signaled on completion. - ur_event_handle_t hSignalEvent, /// [in][optional] Number of events to wait on before executing. - uint32_t numWaitEvents, - /// [in][optional][range(0, numWaitEvents)] Handle of the events to wait - /// on before launching. - ur_event_handle_t *phWaitEvents) try { - auto pfnAppendGraphExp = - ur_lib::getContext()->urDdiTable.QueueExp.pfnAppendGraphExp; - if (nullptr == pfnAppendGraphExp) + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] Pointer to a list of + /// events that must be complete before this command can be executed. + /// If nullptr, the numEventsInWaitList must be 0, indicating that this + /// command does not wait on any event to complete. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] Event object that identifies this particular + /// command instance. + /// If phEventWaitList and phEvent are not nullptr, phEvent must not refer + /// to an element of the phEventWaitList array. + ur_event_handle_t *phEvent) try { + auto pfnGraphExp = ur_lib::getContext()->urDdiTable.EnqueueExp.pfnGraphExp; + if (nullptr == pfnGraphExp) return UR_RESULT_ERROR_UNINITIALIZED; - return pfnAppendGraphExp(hQueue, hGraph, hSignalEvent, numWaitEvents, - phWaitEvents); + return pfnGraphExp(hQueue, hGraph, numEventsInWaitList, phEventWaitList, + phEvent); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -11208,8 +11201,6 @@ ur_result_t UR_APICALL urQueueAppendGraphExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urGraphDestroyExp( /// [in] Handle of the graph object to destroy. ur_exp_graph_handle_t hGraph) try { @@ -11233,8 +11224,6 @@ ur_result_t UR_APICALL urGraphDestroyExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hExecutableGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urGraphExecutableGraphDestroyExp( /// [in] Handle of the executable graph object to destroy. ur_exp_executable_graph_handle_t hExecutableGraph) try { @@ -11259,20 +11248,18 @@ ur_result_t UR_APICALL urGraphExecutableGraphDestroyExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hQueue` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == hResult` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT +/// + `NULL == pResult` ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( /// [in] Native queue to query. ur_queue_handle_t hQueue, /// [out] Pointer to a boolean where the result will be stored. - bool *hResult) try { + bool *pResult) try { auto pfnIsGraphCaptureEnabledExp = ur_lib::getContext()->urDdiTable.QueueExp.pfnIsGraphCaptureEnabledExp; if (nullptr == pfnIsGraphCaptureEnabledExp) return UR_RESULT_ERROR_UNINITIALIZED; - return pfnIsGraphCaptureEnabledExp(hQueue, hResult); + return pfnIsGraphCaptureEnabledExp(hQueue, pResult); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -11288,19 +11275,18 @@ ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hGraph` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == hResult` -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// + `NULL == pResult` +/// - ::UR_RESULT_ERROR_INVALID_GRAPH ur_result_t UR_APICALL urGraphIsEmptyExp( /// [in] Handle of the graph to query. ur_exp_graph_handle_t hGraph, /// [out] Pointer to a boolean where the result will be stored. - bool *hResult) try { + bool *pResult) try { auto pfnIsEmptyExp = ur_lib::getContext()->urDdiTable.GraphExp.pfnIsEmptyExp; if (nullptr == pfnIsEmptyExp) return UR_RESULT_ERROR_UNINITIALIZED; - return pfnIsEmptyExp(hGraph, hResult); + return pfnIsEmptyExp(hGraph, pResult); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -11317,9 +11303,6 @@ ur_result_t UR_APICALL urGraphIsEmptyExp( /// + `NULL == hGraph` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == filePath` -/// - ::UR_RESULT_ERROR_INVALID_VALUE -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urGraphDumpContentsExp( /// [in] Handle of the graph to dump. 
ur_exp_graph_handle_t hGraph, diff --git a/unified-runtime/source/loader/ur_print.cpp b/unified-runtime/source/loader/ur_print.cpp index 3f254d6830de7..6fe53ce867ca3 100644 --- a/unified-runtime/source/loader/ur_print.cpp +++ b/unified-runtime/source/loader/ur_print.cpp @@ -1996,6 +1996,15 @@ ur_result_t urPrintEnqueueNativeCommandExpParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t +urPrintEnqueueGraphExpParams(const struct ur_enqueue_graph_exp_params_t *params, + char *buffer, const size_t buff_size, + size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintEventGetInfoParams(const struct ur_event_get_info_params_t *params, char *buffer, const size_t buff_size, @@ -2795,14 +2804,6 @@ ur_result_t urPrintQueueEndGraphCaptureExpParams( return str_copy(&ss, buffer, buff_size, out_size); } -ur_result_t urPrintQueueAppendGraphExpParams( - const struct ur_queue_append_graph_exp_params_t *params, char *buffer, - const size_t buff_size, size_t *out_size) { - std::stringstream ss; - ss << params; - return str_copy(&ss, buffer, buff_size, out_size); -} - ur_result_t urPrintQueueIsGraphCaptureEnabledExpParams( const struct ur_queue_is_graph_capture_enabled_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size) { diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index d3460ba4879ab..6b76c3a1f306d 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -9588,9 +9588,6 @@ ur_result_t UR_APICALL urEnqueueNativeCommandExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Create a new record & replay graph instance explicitly. /// -/// @details -/// - Create a new record & replay graph instance explicitly. 
-/// /// @returns /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_UNINITIALIZED @@ -9600,8 +9597,6 @@ ur_result_t UR_APICALL urEnqueueNativeCommandExp( /// + `NULL == hContext` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urGraphCreateExp( /// [in] Handle of the context object. ur_context_handle_t hContext, @@ -9612,7 +9607,7 @@ ur_result_t UR_APICALL urGraphCreateExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Begin graph capture on the specified immediate queue. +/// @brief Begin graph capture on the specified queue. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -9621,8 +9616,6 @@ ur_result_t UR_APICALL urGraphCreateExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hQueue` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urQueueBeginGraphCaptureExp( /// [in] Handle of the queue on which to begin graph capture. ur_queue_handle_t hQueue) { @@ -9632,7 +9625,7 @@ ur_result_t UR_APICALL urQueueBeginGraphCaptureExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Begin capturing commands into an existing graph on the specified -/// immediate queue. +/// queue. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -9642,8 +9635,6 @@ ur_result_t UR_APICALL urQueueBeginGraphCaptureExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hQueue` /// + `NULL == hGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urQueueBeginCaptureIntoGraphExp( /// [in] Handle of the queue on which to begin graph capture. 
ur_queue_handle_t hQueue, @@ -9654,7 +9645,7 @@ ur_result_t UR_APICALL urQueueBeginCaptureIntoGraphExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief End graph capture on the specified immediate queue. +/// @brief End graph capture on the specified queue. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -9665,8 +9656,6 @@ ur_result_t UR_APICALL urQueueBeginCaptureIntoGraphExp( /// + `NULL == hQueue` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urQueueEndGraphCaptureExp( /// [in] Handle of the queue on which to end graph capture. ur_queue_handle_t hQueue, @@ -9690,8 +9679,6 @@ ur_result_t UR_APICALL urQueueEndGraphCaptureExp( /// + `NULL == hGraph` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phExecGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urGraphInstantiateGraphExp( /// [in] Handle of the recorded graph to instantiate. ur_exp_graph_handle_t hGraph, @@ -9702,7 +9689,7 @@ ur_result_t UR_APICALL urGraphInstantiateGraphExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Append an executable graph to the queue. +/// @brief Enqueue an executable graph onto the queue. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -9712,20 +9699,27 @@ ur_result_t UR_APICALL urGraphInstantiateGraphExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hQueue` /// + `NULL == hGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT -ur_result_t UR_APICALL urQueueAppendGraphExp( - /// [in] Handle of the queue to append the graph to. +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. 
+ur_result_t UR_APICALL urEnqueueGraphExp( + /// [in] Handle of the queue to which the graph will be enqueued. ur_queue_handle_t hQueue, - /// [in] Handle of the executable graph to append. + /// [in] Handle of the executable graph to be enqueued. ur_exp_executable_graph_handle_t hGraph, - /// [in][optional] Event to be signaled on completion. - ur_event_handle_t hSignalEvent, /// [in][optional] Number of events to wait on before executing. - uint32_t numWaitEvents, - /// [in][optional][range(0, numWaitEvents)] Handle of the events to wait - /// on before launching. - ur_event_handle_t *phWaitEvents) { + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] Pointer to a list of + /// events that must be complete before this command can be executed. + /// If nullptr, the numEventsInWaitList must be 0, indicating that this + /// command does not wait on any event to complete. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] Event object that identifies this particular + /// command instance. + /// If phEventWaitList and phEvent are not nullptr, phEvent must not refer + /// to an element of the phEventWaitList array. + ur_event_handle_t *phEvent) { ur_result_t result = UR_RESULT_SUCCESS; return result; } @@ -9742,8 +9736,6 @@ ur_result_t UR_APICALL urQueueAppendGraphExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urGraphDestroyExp( /// [in] Handle of the graph object to destroy. 
ur_exp_graph_handle_t hGraph) { @@ -9762,8 +9754,6 @@ ur_result_t UR_APICALL urGraphDestroyExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hExecutableGraph` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT ur_result_t UR_APICALL urGraphExecutableGraphDestroyExp( /// [in] Handle of the executable graph object to destroy. ur_exp_executable_graph_handle_t hExecutableGraph) { @@ -9782,14 +9772,12 @@ ur_result_t UR_APICALL urGraphExecutableGraphDestroyExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hQueue` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == hResult` -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT +/// + `NULL == pResult` ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( /// [in] Native queue to query. ur_queue_handle_t hQueue, /// [out] Pointer to a boolean where the result will be stored. - bool *hResult) { + bool *pResult) { ur_result_t result = UR_RESULT_SUCCESS; return result; } @@ -9805,14 +9793,13 @@ ur_result_t UR_APICALL urQueueIsGraphCaptureEnabledExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hGraph` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == hResult` -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// + `NULL == pResult` +/// - ::UR_RESULT_ERROR_INVALID_GRAPH ur_result_t UR_APICALL urGraphIsEmptyExp( /// [in] Handle of the graph to query. ur_exp_graph_handle_t hGraph, /// [out] Pointer to a boolean where the result will be stored. 
- bool *hResult) { + bool *pResult) { ur_result_t result = UR_RESULT_SUCCESS; return result; } @@ -9829,9 +9816,6 @@ ur_result_t UR_APICALL urGraphIsEmptyExp( /// + `NULL == hGraph` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == filePath` -/// - ::UR_RESULT_ERROR_INVALID_VALUE -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urGraphDumpContentsExp( /// [in] Handle of the graph to dump. ur_exp_graph_handle_t hGraph, diff --git a/unified-runtime/test/fuzz/urFuzz.cpp b/unified-runtime/test/fuzz/urFuzz.cpp index 89007cddd8e3f..21ef7af138450 100644 --- a/unified-runtime/test/fuzz/urFuzz.cpp +++ b/unified-runtime/test/fuzz/urFuzz.cpp @@ -402,7 +402,7 @@ int ur_program_create_with_il(TestState &state) { const size_t lWorkSize[] = {1, 1, 1}; urEnqueueKernelLaunch(queue, kernel, nDim, gWorkOffset, gWorkSize, lWorkSize, - 0, nullptr, &event); + nullptr, 1, &event, nullptr); urEventWait(1, &event); urEventRelease(event);