From 7e9590a173cfa65f6aff2d12be6a2b4d5d91ae24 Mon Sep 17 00:00:00 2001 From: BO SANG Date: Thu, 12 Jun 2025 20:21:43 -0700 Subject: [PATCH 1/3] add hpu --- xpu_timer/xpu_timer/hpu/BUILD.bazel | 42 ++ xpu_timer/xpu_timer/hpu/hook.cc | 519 ++++++++++++++++ xpu_timer/xpu_timer/hpu/hook.h | 196 ++++++ xpu_timer/xpu_timer/hpu/hpu_dtype_util.cc | 94 +++ xpu_timer/xpu_timer/hpu/hpu_dtype_util.h | 31 + xpu_timer/xpu_timer/hpu/hpu_timer.cc | 709 ++++++++++++++++++++++ xpu_timer/xpu_timer/hpu/hpu_timer.h | 272 +++++++++ xpu_timer/xpu_timer/hpu/only_keep_hpu.lds | 28 + 8 files changed, 1891 insertions(+) create mode 100644 xpu_timer/xpu_timer/hpu/BUILD.bazel create mode 100644 xpu_timer/xpu_timer/hpu/hook.cc create mode 100644 xpu_timer/xpu_timer/hpu/hook.h create mode 100644 xpu_timer/xpu_timer/hpu/hpu_dtype_util.cc create mode 100644 xpu_timer/xpu_timer/hpu/hpu_dtype_util.h create mode 100644 xpu_timer/xpu_timer/hpu/hpu_timer.cc create mode 100644 xpu_timer/xpu_timer/hpu/hpu_timer.h create mode 100644 xpu_timer/xpu_timer/hpu/only_keep_hpu.lds diff --git a/xpu_timer/xpu_timer/hpu/BUILD.bazel b/xpu_timer/xpu_timer/hpu/BUILD.bazel new file mode 100644 index 0000000000..3ac08a2ec0 --- /dev/null +++ b/xpu_timer/xpu_timer/hpu/BUILD.bazel @@ -0,0 +1,42 @@ +load("@rules_python//python:defs.bzl", "py_binary") +load("//:workspace.bzl", "xpu_cc_binary", "xpu_cc_library") + +package(default_visibility = ["//visibility:public"]) + +exports_files(["only_keep_hpu.lds"]) + +xpu_cc_library( + name = "hpu_timer", + srcs = [ + "hpu_dtype_util.cc", + "hpu_timer.cc", + ], + hdrs = [ + "hpu_dtype_util.h", + "hpu_timer.h", + ], + deps = [ + "//xpu_timer/common:xpu_timer", + "//xpu_timer/protos:cc_hook_proto", + ], +) + +xpu_cc_library( + name = "hpu_hook", + srcs = [ + "hook.cc", + ], + hdrs = [ + "hook.h", + ], + deps = [ + "//xpu_timer/common:macro", + "//xpu_timer/common:manager", + "//xpu_timer/common:util", + "@hpu//:ascendcl", + "@hpu//:hccl", + "@hpu//:opapi", + ], + # force to keep all syms, if not, linker will remove useless symbols. + alwayslink = True, +) diff --git a/xpu_timer/xpu_timer/hpu/hook.cc b/xpu_timer/xpu_timer/hpu/hook.cc new file mode 100644 index 0000000000..25ef2310b4 --- /dev/null +++ b/xpu_timer/xpu_timer/hpu/hook.cc @@ -0,0 +1,519 @@ +#include "xpu_timer/hpu/hook.h" + +#include +#include +#include + +#include "xpu_timer/common/constant.h" +#include "xpu_timer/common/manager.h" +#include "xpu_timer/common/util.h" +#include "xpu_timer/hpu/hpu_timer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void* (*dlsymFn)(void* handle, const char* name); + +void* dlsym(void* handle, const char* name) { + static dlsymFn real_dlsym = NULL; + static bool hook_matmul = xpu_timer::util::EnvVarRegistry::GetEnvVar( + "XPU_TIMER_HPU_HOOK_MATMUL"); + if (real_dlsym == NULL) { + // dlvsym(), provided by glibc since version 2.1, does the same as dlsym() + // but takes a version string as an additional argument. 
+ // the version is from `readelf -a -W libdl.so | grep dlsym` + real_dlsym = (dlsymFn)dlvsym(RTLD_NEXT, "dlsym", "GLIBC_2.2.5"); + // To get the newest version, as well as a potentially one of another + // interceptor in the same process, to do an unversioned query again: + // https://stackoverflow.com/questions/15599026/how-can-i-intercept-dlsym-calls-using-ld-preload + real_dlsym = (dlsymFn)real_dlsym(RTLD_NEXT, "dlsym"); + } + if (hook_matmul) { + if (!strcmp(name, "aclnnMatmulGetWorkspaceSize")) { + SETUP_DLSYM_WITH_HPU_OPAPI(aclnnMatmulGetWorkspaceSize); + return (void*)aclnnMatmulGetWorkspaceSize; + } else if (!strcmp(name, "aclnnMatmul")) { + SETUP_DLSYM_WITH_HPU_OPAPI(aclnnMatmul); + return (void*)aclnnMatmul; + } else if (!(strcmp(name, "aclnnGroupedMatmulV2GetWorkspaceSize"))) { + SETUP_DLSYM_WITH_HPU_OPAPI(aclnnGroupedMatmulV2GetWorkspaceSize); + return (void*)aclnnGroupedMatmulV2GetWorkspaceSize; + } else if (!strcmp(name, "aclnnGroupedMatmulV2")) { + SETUP_DLSYM_WITH_HPU_OPAPI(aclnnGroupedMatmulV2); + return (void*)aclnnGroupedMatmulV2; + } + } + if (!strcmp(name, "HcclAlltoAllV")) { + SETUP_DLSYM_WITH_HPU_HCCL_REAL(HcclAlltoAllV); + return (void*)HcclAlltoAllV; + } else if (!strcmp(name, "HcclReduce")) { + SETUP_DLSYM_WITH_HPU_HCCL_REAL(HcclReduce); + return (void*)HcclReduce; + } else if (!strcmp(name, "HcclScatter")) { + SETUP_DLSYM_WITH_HPU_HCCL_REAL(HcclScatter); + return (void*)HcclScatter; + } else if (!strcmp(name, "HcclBatchSendRecv")) { + SETUP_DLSYM_WITH_HPU_HCCL_REAL(HcclBatchSendRecv); + return (void*)HcclBatchSendRecv; + } else if (!strcmp(name, "HcclAlltoAll")) { + SETUP_DLSYM_WITH_HPU_HCCL_REAL(HcclAlltoAll); + return (void*)HcclAlltoAll; + } + return real_dlsym(handle, name); +} + +aclnnStatus aclnnMatmulGetWorkspaceSize(const aclTensor* self, + const aclTensor* other, aclTensor* out, + int8_t cubeMathType, + uint64_t* workspace_size, + aclOpExecutor** executor) { + aclnnStatus ret_status = orig_aclnnMatmulGetWorkspaceSize( + self, other, out, cubeMathType, workspace_size, executor); + + if (!::xpu_timer::util::config::GlobalConfig::enable) { + return ret_status; + } + xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.interceptMatmulInfo(self, other, executor); + return ret_status; +} + +aclnnStatus aclnnMatmul(void* workspace, uint64_t workspaceSize, + aclOpExecutor* executor, aclrtStream stream) { + if (!::xpu_timer::util::config::GlobalConfig::enable) { + return orig_aclnnMatmul(workspace, workspaceSize, executor, stream); + } + + auto fn = xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleMatmul(executor); + + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + event->reset(stream, fn, xpu_timer::constant::Metrics::MatmulMetrics::TYPE); + + auto ret_status = + orig_aclnnMatmul(workspace, workspaceSize, executor, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + return ret_status; +} + +// aclnnGroupedMatmulV2 +aclnnStatus aclnnGroupedMatmulV2GetWorkspaceSize( + const aclTensorList* x, const aclTensorList* weight, + const aclTensorList* biasOptional, const aclTensorList* scaleOptional, + const aclTensorList* offsetOptional, + const aclTensorList* antiquantScaleOptional, + const aclTensorList* antiquantOffsetOptional, + const aclIntArray* groupListOptional, int64_t splitItem, int64_t groupType, + const aclTensorList* y, uint64_t* workspaceSize, aclOpExecutor** executor) { + aclnnStatus ret_status = orig_aclnnGroupedMatmulV2GetWorkspaceSize( + x, weight, biasOptional, 
scaleOptional, offsetOptional, + antiquantScaleOptional, antiquantOffsetOptional, groupListOptional, + splitItem, groupType, y, workspaceSize, executor); + if (!::xpu_timer::util::config::GlobalConfig::enable) { + return ret_status; + } + + xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.interceptGroupedMatmulV2Info( + x, weight, groupListOptional, splitItem, groupType, executor); + return ret_status; +} + +aclnnStatus aclnnGroupedMatmulV2(void* workspace, uint64_t workspaceSize, + aclOpExecutor* executor, aclrtStream stream) { + if (!::xpu_timer::util::config::GlobalConfig::enable) { + return orig_aclnnGroupedMatmulV2(workspace, workspaceSize, executor, + stream); + } + + auto fn = xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleGroupedMatmulV2(executor); + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + event->reset(stream, fn, xpu_timer::constant::Metrics::MatmulMetrics::TYPE); + + auto ret_status = + orig_aclnnGroupedMatmulV2(workspace, workspaceSize, executor, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + return ret_status; +} + +HcclResult HcclAllReduce(void* sendBuf, void* recvBuf, uint64_t count, + HcclDataType dataType, HcclReduceOp op, HcclComm comm, + aclrtStream stream) { + SETUP_DLSYM_WITH_HPU_HCCL(HcclAllReduce); + + if (!::xpu_timer::util::config::GlobalConfig::enable || isNranksLE1(comm)) { + return orig_HcclAllReduce(sendBuf, recvBuf, count, dataType, op, comm, + stream); + } + + std::string func_name = "HcclAllReduce"; + std::string coll_type = "AllReduce"; + auto fn = xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleHccl(count, dataType, comm, func_name, + coll_type); + + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + + event->reset(stream, fn, xpu_timer::constant::Metrics::CollMetrics::TYPE); + + auto retResult = + orig_HcclAllReduce(sendBuf, recvBuf, count, dataType, op, comm, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + + return retResult; +} +HcclResult HcclBroadcast(void* buf, uint64_t count, HcclDataType dataType, + uint32_t root, HcclComm comm, aclrtStream stream) { + SETUP_DLSYM_WITH_HPU_HCCL(HcclBroadcast); + + if (!::xpu_timer::util::config::GlobalConfig::enable || isNranksLE1(comm)) { + return orig_HcclBroadcast(buf, count, dataType, root, comm, stream); + } + + std::string func_name = "HcclBroadcast"; + std::string coll_type = "Broadcast"; + auto fn = xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleHccl(count, dataType, comm, func_name, + coll_type); + + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + + event->reset(stream, fn, xpu_timer::constant::Metrics::CollMetrics::TYPE); + + auto retResult = orig_HcclBroadcast(buf, count, dataType, root, comm, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + + return retResult; +} + +HcclResult HcclReduceScatter(void* sendBuf, void* recvBuf, uint64_t recvCount, + HcclDataType dataType, HcclReduceOp op, + HcclComm comm, aclrtStream stream) { + SETUP_DLSYM_WITH_HPU_HCCL(HcclReduceScatter); + + if (!::xpu_timer::util::config::GlobalConfig::enable || isNranksLE1(comm)) { + return orig_HcclReduceScatter(sendBuf, recvBuf, recvCount, dataType, op, + comm, stream); + } + + std::string func_name = "HcclReduceScatter"; + std::string coll_type = "ReduceScatter"; + auto fn = xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleHccl(recvCount, dataType, comm, + 
func_name, coll_type); + + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + + event->reset(stream, fn, xpu_timer::constant::Metrics::CollMetrics::TYPE); + + auto retResult = orig_HcclReduceScatter(sendBuf, recvBuf, recvCount, dataType, + op, comm, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + + return retResult; +} + +HcclResult HcclReduce(void* sendBuf, void* recvBuf, uint64_t count, + HcclDataType dataType, HcclReduceOp op, uint32_t root, + HcclComm comm, aclrtStream stream) { + SETUP_DLSYM_WITH_HPU_HCCL(HcclReduce); + + if (!::xpu_timer::util::config::GlobalConfig::enable || isNranksLE1(comm)) { + return orig_HcclReduce(sendBuf, recvBuf, count, dataType, op, root, comm, + stream); + } + + std::string func_name = "HcclReduce"; + std::string coll_type = "Reduce"; + auto fn = xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleHccl(count, dataType, comm, func_name, + coll_type); + + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + + event->reset(stream, fn, xpu_timer::constant::Metrics::CollMetrics::TYPE); + + auto retResult = orig_HcclReduce(sendBuf, recvBuf, count, dataType, op, root, + comm, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + + return retResult; +} + +HcclResult HcclAlltoAll(const void* sendBuf, uint64_t sendCount, + HcclDataType sendType, const void* recvBuf, + uint64_t recvCount, HcclDataType recvType, + HcclComm comm, aclrtStream stream) { + SETUP_DLSYM_WITH_HPU_HCCL(HcclAlltoAll); + + if (!::xpu_timer::util::config::GlobalConfig::enable || isNranksLE1(comm)) { + return orig_HcclAlltoAll(sendBuf, sendCount, sendType, recvBuf, recvCount, + recvType, comm, stream); + } + + std::string func_name = "HcclAlltoAll"; + std::string coll_type = "AlltoAll"; + // TODO(jingjun): calculate all to all + auto fn = xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleHccl(recvCount, recvType, comm, + func_name, coll_type); + + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + + event->reset(stream, fn, xpu_timer::constant::Metrics::CollMetrics::TYPE); + + auto retResult = orig_HcclAlltoAll(sendBuf, sendCount, sendType, recvBuf, + recvCount, recvType, comm, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + + return retResult; +} + +HcclResult HcclAlltoAllV(const void* sendBuf, const void* sendCounts, + const void* sdispls, HcclDataType sendType, + const void* recvBuf, const void* recvCounts, + const void* rdispls, HcclDataType recvType, + HcclComm comm, aclrtStream stream) { + SETUP_DLSYM_WITH_HPU_HCCL(HcclAlltoAllV); + + if (!::xpu_timer::util::config::GlobalConfig::enable || isNranksLE1(comm)) { + return orig_HcclAlltoAllV(sendBuf, sendCounts, sdispls, sendType, recvBuf, + recvCounts, rdispls, recvType, comm, stream); + } + + std::string func_name = "HcclAlltoAllV"; + std::string coll_type = "AlltoAllV"; + // TODO(jingjun): calculate all to all v + auto fn = + xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleHccl(*((uint64_t*)recvCounts), recvType, + comm, func_name, coll_type); + + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + + event->reset(stream, fn, xpu_timer::constant::Metrics::CollMetrics::TYPE); + + auto retResult = + orig_HcclAlltoAllV(sendBuf, sendCounts, sdispls, sendType, recvBuf, + recvCounts, rdispls, recvType, comm, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + + return retResult; +} + 
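+// All of the wrappers in this file follow the same pattern: lazily resolve
+// the real symbol, wrap the call in a start/stop event pair, and forward the
+// original arguments unchanged. A minimal sketch of how the hook library is
+// meant to be injected (the .so path here is illustrative, not defined in
+// this patch; XPU_TIMER_HPU_HOOK_MATMUL is the env var read above):
+//
+//   XPU_TIMER_HPU_HOOK_MATMUL=1 \
+//   LD_PRELOAD=/path/to/libxpu_timer.so \
+//   python train.py
+//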
+HcclResult HcclAllGather(void* sendBuf, void* recvBuf, uint64_t sendCount, + HcclDataType dataType, HcclComm comm, + aclrtStream stream) { + SETUP_DLSYM_WITH_HPU_HCCL(HcclAllGather); + + if (!::xpu_timer::util::config::GlobalConfig::enable || isNranksLE1(comm)) { + return orig_HcclAllGather(sendBuf, recvBuf, sendCount, dataType, comm, + stream); + } + + std::string func_name = "HcclAllGather"; + std::string coll_type = "AllGather"; + auto fn = xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleHccl(sendCount, dataType, comm, + func_name, coll_type); + + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + + event->reset(stream, fn, xpu_timer::constant::Metrics::CollMetrics::TYPE); + + HcclResult retResult = + orig_HcclAllGather(sendBuf, recvBuf, sendCount, dataType, comm, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + + return retResult; +} + +HcclResult HcclBarrier(HcclComm comm, aclrtStream stream) { + SETUP_DLSYM_WITH_HPU_HCCL(HcclBarrier); + + if (!::xpu_timer::util::config::GlobalConfig::enable || isNranksLE1(comm)) { + return orig_HcclBarrier(comm, stream); + } + + std::string func_name = "HcclBarrier"; + std::string coll_type = "Barrier"; + auto fn = xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleHccl(1, HCCL_DATA_TYPE_FP32, comm, + func_name, coll_type); + + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + + event->reset(stream, fn, xpu_timer::constant::Metrics::CollMetrics::TYPE); + + auto retResult = orig_HcclBarrier(comm, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + + return retResult; +} + +HcclResult HcclScatter(void* sendBuf, void* recvBuf, uint64_t recvCount, + HcclDataType dataType, uint32_t root, HcclComm comm, + aclrtStream stream) { + SETUP_DLSYM_WITH_HPU_HCCL(HcclScatter); + + if (!::xpu_timer::util::config::GlobalConfig::enable || isNranksLE1(comm)) { + return orig_HcclScatter(sendBuf, recvBuf, recvCount, dataType, root, comm, + stream); + } + + std::string func_name = "HcclScatter"; + std::string coll_type = "Scatter"; + auto fn = xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleHccl(recvCount, dataType, comm, + func_name, coll_type); + + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + + event->reset(stream, fn, xpu_timer::constant::Metrics::CollMetrics::TYPE); + + auto retResult = orig_HcclScatter(sendBuf, recvBuf, recvCount, dataType, root, + comm, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + + return retResult; +} + +HcclResult HcclSend(void* sendBuf, uint64_t count, HcclDataType dataType, + uint32_t destRank, HcclComm comm, aclrtStream stream) { + SETUP_DLSYM_WITH_HPU_HCCL(HcclSend); + + if (!::xpu_timer::util::config::GlobalConfig::enable || isNranksLE1(comm)) { + return orig_HcclSend(sendBuf, count, dataType, destRank, comm, stream); + } + + std::string func_name = "HcclSend"; + std::string coll_type = "Send"; + auto fn = xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleHccl(count, dataType, comm, func_name, + coll_type); + + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + + event->reset(stream, fn, xpu_timer::constant::Metrics::CollMetrics::TYPE); + + auto retResult = + orig_HcclSend(sendBuf, count, dataType, destRank, comm, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + + return retResult; +} +HcclResult HcclRecv(void* recvBuf, uint64_t count, 
HcclDataType dataType, + uint32_t srcRank, HcclComm comm, aclrtStream stream) { + SETUP_DLSYM_WITH_HPU_HCCL(HcclRecv); + + if (!::xpu_timer::util::config::GlobalConfig::enable || isNranksLE1(comm)) { + return orig_HcclRecv(recvBuf, count, dataType, srcRank, comm, stream); + } + + std::string func_name = "HcclRecv"; + std::string coll_type = "Recv"; + auto fn = xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleHccl(count, dataType, comm, func_name, + coll_type); + + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + + event->reset(stream, fn, xpu_timer::constant::Metrics::CollMetrics::TYPE); + + auto retResult = + orig_HcclRecv(recvBuf, count, dataType, srcRank, comm, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + + return retResult; +} +HcclResult HcclBatchSendRecv(HcclSendRecvItem* sendRecvInfo, uint32_t itemNum, + HcclComm comm, aclrtStream stream) { + SETUP_DLSYM_WITH_HPU_HCCL(HcclBatchSendRecv); + + if (!::xpu_timer::util::config::GlobalConfig::enable || isNranksLE1(comm)) { + return orig_HcclBatchSendRecv(sendRecvInfo, itemNum, comm, stream); + } + + std::string func_name = "HcclBatchSendRecv"; + std::string coll_type = "BatchSendRecv"; + // TODO(jingjun): support batched send recv + auto fn = xpu_timer::GpuTimerManager::getInstance() + .intercept_manager.handleHccl(itemNum, HCCL_DATA_TYPE_FP32, + comm, func_name, coll_type); + + auto event = + xpu_timer::GpuTimerManager::getInstance() + .getEvent(); + + event->reset(stream, fn, xpu_timer::constant::Metrics::CollMetrics::TYPE); + + auto retResult = orig_HcclBatchSendRecv(sendRecvInfo, itemNum, comm, stream); + + xpu_timer::GpuTimerManager::getInstance() + .recordEvent(event); + + return retResult; +} + +bool isNranksLE1(HcclComm comm) { + uint32_t nranks; + HcclGetRankSize(comm, &nranks); + return nranks <= 1; +} +#ifdef __cplusplus +} +#endif diff --git a/xpu_timer/xpu_timer/hpu/hook.h b/xpu_timer/xpu_timer/hpu/hook.h new file mode 100644 index 0000000000..494223f0a8 --- /dev/null +++ b/xpu_timer/xpu_timer/hpu/hook.h @@ -0,0 +1,196 @@ +#pragma once +#include + +#include "xpu_timer/common/macro.h" +#include "xpu_timer/common/platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// dlsym +EXPOSE_API +void* dlsym(void* handle, const char* name); + +// aclnnMatmul +typedef aclnnStatus (*aclnnMatmulFn)(void*, uint64_t, aclOpExecutor*, + aclrtStream); + +static aclnnMatmulFn orig_aclnnMatmul = NULL; + +typedef aclnnStatus (*aclnnMatmulGetWorkspaceSizeFn)( + const aclTensor* self, const aclTensor* other, aclTensor* out, + int8_t cubeMathType, uint64_t* workspace_size, aclOpExecutor** executor); + +static aclnnMatmulGetWorkspaceSizeFn orig_aclnnMatmulGetWorkspaceSize = NULL; + +EXPOSE_API aclnnStatus aclnnMatmulGetWorkspaceSize( + const aclTensor* self, const aclTensor* other, aclTensor* out, + int8_t cubeMathType, uint64_t* workspace_size, aclOpExecutor** executor); + +EXPOSE_API aclnnStatus aclnnMatmul(void* workspace, uint64_t workspaceSize, + aclOpExecutor* executor, aclrtStream stream); +// aclnnGroupedMatmulV2 +typedef aclnnStatus (*aclnnGroupedMatmulV2GetWorkspaceSizeFn)( + const aclTensorList* x, const aclTensorList* weight, + const aclTensorList* biasOptional, const aclTensorList* scaleOptional, + const aclTensorList* offsetOptional, + const aclTensorList* antiquantScaleOptional, + const aclTensorList* antiquantOffsetOptional, + const aclIntArray* groupListOptional, int64_t splitItem, int64_t groupType, + const aclTensorList* y, uint64_t* workspaceSize, 
aclOpExecutor** executor); + +typedef aclnnStatus (*aclnnGroupedMatmulV2Fn)(void* workspace, + uint64_t workspaceSize, + aclOpExecutor* executor, + aclrtStream stream); + +static aclnnGroupedMatmulV2GetWorkspaceSizeFn + orig_aclnnGroupedMatmulV2GetWorkspaceSize = NULL; +static aclnnGroupedMatmulV2Fn orig_aclnnGroupedMatmulV2 = NULL; + +EXPOSE_API aclnnStatus aclnnGroupedMatmulV2GetWorkspaceSize( + const aclTensorList* x, const aclTensorList* weight, + const aclTensorList* biasOptional, const aclTensorList* scaleOptional, + const aclTensorList* offsetOptional, + const aclTensorList* antiquantScaleOptional, + const aclTensorList* antiquantOffsetOptional, + const aclIntArray* groupListOptional, int64_t splitItem, int64_t groupType, + const aclTensorList* y, uint64_t* workspaceSize, aclOpExecutor** executor); +EXPOSE_API aclnnStatus aclnnGroupedMatmulV2(void* workspace, + uint64_t workspaceSize, + aclOpExecutor* executor, + aclrtStream stream); + +// HcclAllReduce +typedef HcclResult (*HcclAllReduceFn)(void* sendBuf, void* recvBuf, + uint64_t count, HcclDataType dataType, + HcclReduceOp op, HcclComm comm, + aclrtStream stream); +static HcclAllReduceFn orig_HcclAllReduce = NULL; + +EXPOSE_API +HcclResult HcclAllReduce(void* sendBuf, void* recvBuf, uint64_t count, + HcclDataType dataType, HcclReduceOp op, HcclComm comm, + aclrtStream stream); + +// HcclBroadcast +typedef HcclResult (*HcclBroadcastFn)(void* buf, uint64_t count, + HcclDataType dataType, uint32_t root, + HcclComm comm, aclrtStream stream); +static HcclBroadcastFn orig_HcclBroadcast = NULL; +EXPOSE_API +HcclResult HcclBroadcast(void* buf, uint64_t count, HcclDataType dataType, + uint32_t root, HcclComm comm, aclrtStream stream); + +// HcclReduceScatter +typedef HcclResult (*HcclReduceScatterFn)(void* sendBuf, void* recvBuf, + uint64_t recvCount, + HcclDataType dataType, + HcclReduceOp op, HcclComm comm, + aclrtStream stream); +static HcclReduceScatterFn orig_HcclReduceScatter = NULL; +EXPOSE_API +HcclResult HcclReduceScatter(void* sendBuf, void* recvBuf, uint64_t recvCount, + HcclDataType dataType, HcclReduceOp op, + HcclComm comm, aclrtStream stream); + +// HcclReduce +typedef HcclResult (*HcclReduceFn)(void* sendBuf, void* recvBuf, uint64_t count, + HcclDataType dataType, HcclReduceOp op, + uint32_t root, HcclComm comm, + aclrtStream stream); +static HcclReduceFn orig_HcclReduce = NULL; +EXPOSE_API +HcclResult HcclReduce(void* sendBuf, void* recvBuf, uint64_t count, + HcclDataType dataType, HcclReduceOp op, uint32_t root, + HcclComm comm, aclrtStream stream); + +// HcclAlltoAll +typedef HcclResult (*HcclAlltoAllFn)(const void* sendBuf, uint64_t sendCount, + HcclDataType sendType, const void* recvBuf, + uint64_t recvCount, HcclDataType recvType, + HcclComm comm, aclrtStream stream); +static HcclAlltoAllFn orig_HcclAlltoAll = NULL; +EXPOSE_API +HcclResult HcclAlltoAll(const void* sendBuf, uint64_t sendCount, + HcclDataType sendType, const void* recvBuf, + uint64_t recvCount, HcclDataType recvType, + HcclComm comm, aclrtStream stream); + +// HcclAlltoAllV +typedef HcclResult (*HcclAlltoAllVFn)( + const void* sendBuf, const void* sendCounts, const void* sdispls, + HcclDataType sendType, const void* recvBuf, const void* recvCounts, + const void* rdispls, HcclDataType recvType, HcclComm comm, + aclrtStream stream); + +static HcclAlltoAllVFn orig_HcclAlltoAllV = NULL; +EXPOSE_API +HcclResult HcclAlltoAllV(const void* sendBuf, const void* sendCounts, + const void* sdispls, HcclDataType sendType, + const void* recvBuf, const void* 
recvCounts, + const void* rdispls, HcclDataType recvType, + HcclComm comm, aclrtStream stream); + +// HcclAllGather +typedef HcclResult (*HcclAllGatherFn)(void*, void*, uint64_t, HcclDataType, + HcclComm, aclrtStream); +static HcclAllGatherFn orig_HcclAllGather = NULL; + +EXPOSE_API +HcclResult HcclAllGather(void* sendBuf, void* recvBuf, uint64_t sendCount, + HcclDataType dataType, HcclComm comm, + aclrtStream stream); + +// HcclBarrier +typedef HcclResult (*HcclBarrierFn)(HcclComm comm, aclrtStream stream); + +static HcclBarrierFn orig_HcclBarrier = NULL; +EXPOSE_API +HcclResult HcclBarrier(HcclComm comm, aclrtStream stream); + +// HcclScatter +typedef HcclResult (*HcclScatterFn)(void* sendBuf, void* recvBuf, + uint64_t recvCount, HcclDataType dataType, + uint32_t root, HcclComm comm, + aclrtStream stream); +static HcclScatterFn orig_HcclScatter = NULL; +EXPOSE_API +HcclResult HcclScatter(void* sendBuf, void* recvBuf, uint64_t recvCount, + HcclDataType dataType, uint32_t root, HcclComm comm, + aclrtStream stream); + +// HcclSend +typedef HcclResult (*HcclSendFn)(void* sendBuf, uint64_t count, + HcclDataType dataType, uint32_t destRank, + HcclComm comm, aclrtStream stream); +static HcclSendFn orig_HcclSend = NULL; +EXPOSE_API +HcclResult HcclSend(void* sendBuf, uint64_t count, HcclDataType dataType, + uint32_t destRank, HcclComm comm, aclrtStream stream); + +// HcclRecv +typedef HcclResult (*HcclRecvFn)(void* recvBuf, uint64_t count, + HcclDataType dataType, uint32_t srcRank, + HcclComm comm, aclrtStream stream); +static HcclRecvFn orig_HcclRecv = NULL; +EXPOSE_API +HcclResult HcclRecv(void* recvBuf, uint64_t count, HcclDataType dataType, + uint32_t srcRank, HcclComm comm, aclrtStream stream); + +// HcclBatchSendRecv +typedef HcclResult (*HcclBatchSendRecvFn)(HcclSendRecvItem* sendRecvInfo, + uint32_t itemNum, HcclComm comm, + aclrtStream stream); +static HcclBatchSendRecvFn orig_HcclBatchSendRecv = NULL; +EXPOSE_API +HcclResult HcclBatchSendRecv(HcclSendRecvItem* sendRecvInfo, uint32_t itemNum, + HcclComm comm, aclrtStream stream); + +// judge the rank of comm is <= 1 +bool isNranksLE1(HcclComm comm); + +#ifdef __cplusplus +} +#endif diff --git a/xpu_timer/xpu_timer/hpu/hpu_dtype_util.cc b/xpu_timer/xpu_timer/hpu/hpu_dtype_util.cc new file mode 100644 index 0000000000..4a41c69b3d --- /dev/null +++ b/xpu_timer/xpu_timer/hpu/hpu_dtype_util.cc @@ -0,0 +1,94 @@ + +#include "xpu_timer/hpu/hpu_dtype_util.h" + +namespace xpu_timer { +namespace hpu { +const std::string HpuDataTypeUtils::UNKNOWN_ACL_DTYPE = "UNKNOWN"; +std::string HpuDataTypeUtils::gpu_ = ""; + +const std::unordered_map + HpuDataTypeUtils::aclDataTypeToStringMap = { + {ACL_FLOAT, "float32"}, {ACL_FLOAT16, "float16"}, + {ACL_INT8, "int8"}, {ACL_UINT8, "uint8"}, + {ACL_INT16, "int16"}, {ACL_UINT16, "uint16"}, + {ACL_UINT32, "uint32"}, {ACL_INT64, "int64"}, + {ACL_UINT64, "uint64"}, {ACL_DOUBLE, "double"}, + {ACL_BOOL, "bool"}, {ACL_STRING, "string"}, + {ACL_COMPLEX64, "complex64"}, {ACL_COMPLEX128, "complex128"}, + {ACL_BF16, "bfloat16"}, {ACL_INT4, "int4"}, + {ACL_UINT1, "uint1"}, {ACL_COMPLEX32, "complex32"}, +}; + +const std::unordered_map + HpuDataTypeUtils::hcclDataTypeToStringMap = { + {HCCL_DATA_TYPE_INT8, "int8"}, /* int8 */ + {HCCL_DATA_TYPE_INT16, "int16"}, /* int16 */ + {HCCL_DATA_TYPE_INT32, "int32"}, /* int32 */ + {HCCL_DATA_TYPE_FP16, "float16"}, /* fp16 */ + {HCCL_DATA_TYPE_FP32, "float32"}, /* fp32 */ + {HCCL_DATA_TYPE_INT64, "int64"}, /* int64 */ + {HCCL_DATA_TYPE_UINT64, "uint64"}, /* uint64 */ + 
{HCCL_DATA_TYPE_UINT8, "uint8"},    /* uint8 */
+        {HCCL_DATA_TYPE_UINT16, "uint16"},  /* uint16 */
+        {HCCL_DATA_TYPE_UINT32, "uint32"},  /* uint32 */
+        {HCCL_DATA_TYPE_FP64, "float64"},   /* fp64 */
+        {HCCL_DATA_TYPE_BFP16, "bfloat16"}, /* bfp16 */
+        {HCCL_DATA_TYPE_INT128, "int128"},  /* int128 */
+};
+
+const std::unordered_map<std::string, uint64_t>
+    HpuDataTypeUtils::dtypeSizeInBytes = {
+        {"int8", 1},    {"int16", 2},   {"int32", 4},   {"float16", 2},
+        {"float32", 4}, {"int64", 8},   {"uint64", 8},  {"uint8", 1},
+        {"uint16", 2},  {"uint32", 4},  {"float64", 8}, {"bfloat16", 2},
+        {"int128", 16}, {UNKNOWN_ACL_DTYPE, 0},
+};
+
+// Peak TFLOPS per device and dtype. Keys must match the strings returned by
+// getAclDtype(), since the matmul "dtype" label carries those names.
+const std::unordered_map<std::string, std::unordered_map<std::string, double>>
+    HpuDataTypeUtils::gpuHardwareFlops = {
+        {"910B",
+         {
+             {"float16", 376.350},
+             {"bfloat16", 364.928},
+             {"float32", 99.559},
+         }},
+};
+const std::string& HpuDataTypeUtils::getAclDtype(aclDataType dtype) {
+  auto it = aclDataTypeToStringMap.find(dtype);
+  return it == aclDataTypeToStringMap.end() ? UNKNOWN_ACL_DTYPE : it->second;
+}
+
+uint64_t HpuDataTypeUtils::getDtypeSizeInBytes(const std::string& dtype) {
+  auto it = dtypeSizeInBytes.find(dtype);
+  return it == dtypeSizeInBytes.end() ? 0 : it->second;
+}
+
+void HpuDataTypeUtils::setGpu(const std::string& gpu) { gpu_ = gpu; }
+
+double HpuDataTypeUtils::getGpuHardwareFlops(const std::string& dtype) {
+  static const std::unordered_map<std::string, double>* gpu_ptr = nullptr;
+  if (!gpu_ptr) {
+    auto it = gpuHardwareFlops.find(gpu_);
+    if (it != gpuHardwareFlops.end()) {
+      gpu_ptr = &it->second;
+    } else {
+      gpu_ptr = &gpuHardwareFlops.at("910B");
+    }
+  }
+
+  auto it = gpu_ptr->find(dtype);
+  if (it != gpu_ptr->end()) {
+    return it->second;
+  }
+  // fall back to the 910B float16 peak
+  return 376.350;
+}
+
+const std::string& HpuDataTypeUtils::getHcclDataType(
+    const HcclDataType& dtype) {
+  auto it = hcclDataTypeToStringMap.find(dtype);
+  return it == hcclDataTypeToStringMap.end() ? 
UNKNOWN_ACL_DTYPE : it->second; +} + +} // namespace hpu +} // namespace xpu_timer diff --git a/xpu_timer/xpu_timer/hpu/hpu_dtype_util.h b/xpu_timer/xpu_timer/hpu/hpu_dtype_util.h new file mode 100644 index 0000000000..1bc3520dfe --- /dev/null +++ b/xpu_timer/xpu_timer/hpu/hpu_dtype_util.h @@ -0,0 +1,31 @@ +#pragma once +#include + +#include "xpu_timer/common/platform.h" + +namespace xpu_timer { +namespace hpu { + +class HpuDataTypeUtils { + public: + static const std::string UNKNOWN_ACL_DTYPE; + + static const std::string& getAclDtype(aclDataType dtype); + static const std::string& getHcclDataType(const HcclDataType& dtype); + static uint64_t getDtypeSizeInBytes(const std::string& dtype); + static double getGpuHardwareFlops(const std::string& dtype); + static void setGpu(const std::string& gpu); + + private: + static const std::unordered_map + aclDataTypeToStringMap; + static const std::unordered_map + hcclDataTypeToStringMap; + static const std::unordered_map dtypeSizeInBytes; + static const std::unordered_map> + gpuHardwareFlops; + static std::string gpu_; +}; +} // namespace hpu +} // namespace xpu_timer diff --git a/xpu_timer/xpu_timer/hpu/hpu_timer.cc b/xpu_timer/xpu_timer/hpu/hpu_timer.cc new file mode 100644 index 0000000000..af91ea50b6 --- /dev/null +++ b/xpu_timer/xpu_timer/hpu/hpu_timer.cc @@ -0,0 +1,709 @@ +#include "xpu_timer/hpu/hpu_timer.h" + +#include + +#include +#include + +#include "xpu_timer/common/logging.h" +#include "xpu_timer/common/util.h" + +namespace xpu_timer { +namespace hpu { + +/* + * =================================== + * Class impl of EventStartTimeHelper + * =================================== + */ + +EventStartTimeHelper::EventStartTimeHelper(aclrtStream s) : stream_(s) { + int rank = util::EnvVarRegistry::GetEnvVar("LOCAL_RANK"); + aclrtSetDevice(rank); + auto status = aclrtCreateEvent(&start_event_); + if (status != ACL_ERROR_NONE) { + XLOG(ERROR) << "create event err: " << aclGetRecentErrMsg(); + } +} + +void EventStartTimeHelper::reset() { + auto status = aclrtRecordEvent(start_event_, stream_); + if (status != ACL_ERROR_NONE) { + XLOG(ERROR) << "record event err: " << aclGetRecentErrMsg() + << " stream: " << stream_; + } + status = aclrtSynchronizeEvent(start_event_); + if (status != ACL_ERROR_NONE) { + XLOG(ERROR) << "sync event err: " << aclGetRecentErrMsg() + << " stream: " << stream_; + } + std::chrono::time_point start = + std::chrono::system_clock::now(); + cpu_time_ = std::chrono::duration_cast( + start.time_since_epoch()) + .count(); +} + +time_t EventStartTimeHelper::getTime(aclrtEvent start_launch_event, + bool* is_validate_to_trace) { + float elapsed_time; // ms + aclrtEventElapsedTime(&elapsed_time, start_event_, start_launch_event); + double es = ((double)elapsed_time * 1000); // ms->us + // Skip events that begin before the EventStartTimeHelper. 
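+  // The kernel's wall-clock start is approximated as cpu_time_ (captured when
+  // start_event_ was synchronized) plus the device-measured gap between the
+  // helper event and the kernel's launch event; a negative gap means the
+  // kernel was launched before the helper was last reset.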
+ *is_validate_to_trace = (es > 0); + return cpu_time_ + time_t(es); +} + +/* + * =================================== + * Class impl of HpuTimer + * =================================== + * =================================== + * Static variables + * =================================== + */ + +std::unordered_map HpuTimer::hccl_seq_num{}; +std::unordered_map HpuTimer::tracing_metas_{}; +std::unordered_map HpuTimer::trace_id_counter_{}; +std::unordered_map + HpuTimer::stream_timer_helper_{}; +int HpuTimer::kernel_encoding_counter_(0); + +/* + * =================================== + * Interface Overrides + * =================================== + * The following member functions are + * overrides from the base interface. + * =================================== + */ +const std::string_view& HpuTimer::getType() { return type_; } + +const uint64_t HpuTimer::getProblemSize() { return problem_size_; } + +void HpuTimer::startRecord() { + auto status = aclrtRecordEvent(start_event_, stream_); + if (status != ACL_ERROR_NONE) { + XLOG(ERROR) << "record event err: " << aclGetRecentErrMsg() + << " stream: " << stream_; + } +} + +void HpuTimer::endRecord() { + if (is_host_) { + finish_time_ = std::chrono::system_clock::now(); + return; + } + auto status = aclrtRecordEvent(stop_event_, stream_); + if (status != ACL_ERROR_NONE) { + XLOG(ERROR) << "record event err: " << aclGetRecentErrMsg() + << " stream: " << stream_; + } +} + +int HpuTimer::getTraceCode() { return trace_code_; } + +uint64_t HpuTimer::getTraceId() { return trace_id_; } + +const std::string HpuTimer::getName() { return name_; } + +bool HpuTimer::isReady() { + if (is_host_) return true; + aclrtEventRecordedStatus eventStatus; + auto status = aclrtQueryEventStatus(stop_event_, &eventStatus); + if (status != ACL_ERROR_NONE) { + XLOG(ERROR) << "aclrtQueryEventStatus err: " << aclGetRecentErrMsg(); + } + + bool ready = (eventStatus == ACL_EVENT_RECORDED_STATUS_COMPLETE); + if (!ready) { + hang_counter_ += 1; + } else { + // After the HostEvent is ready, wait for the DeviceEvent + // for 5 minutes, and also return ready=True. This is to prevent any + // potential random hang issues. 
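+    // (the timeout argument of aclrtSynchronizeEventWithTimeout is in
+    // milliseconds: 300000 ms == 5 minutes)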
+    auto status = aclrtSynchronizeEventWithTimeout(stop_event_, 300000);
+    if (status != ACL_ERROR_NONE) {
+      XLOG(ERROR) << "aclrtSynchronizeEventWithTimeout err: "
+                  << aclGetRecentErrMsg();
+    }
+  }
+  return ready;
+}
+
+time_t HpuTimer::getExecuteTimeStamp() {
+  auto execute_time = stream_timer_helper_[stream_]->getTime(
+      start_event_, &(this->is_validate_to_trace_));
+  return execute_time;
+}
+
+time_t HpuTimer::getLaunchTimeStamp() { return launch_time_timestamp_; }
+
+uint64_t HpuTimer::getDuration() {
+  if (is_host_) {
+    auto dur_us = std::chrono::duration_cast<std::chrono::microseconds>(
+        finish_time_ - launch_time_);
+    return dur_us.count();
+  }
+  float elapsed_time;  // ms
+  auto status = aclrtEventElapsedTime(&elapsed_time, start_event_, stop_event_);
+  if (status != ACL_ERROR_NONE) {
+    elapsed_time = 0.0;
+    XLOG(ERROR) << "aclrtEventElapsedTime event err: " << aclGetRecentErrMsg()
+                << " stream: " << stream_;
+  }
+  return uint64_t(elapsed_time * 1000);  // ms -> us
+}
+
+void HpuTimer::reBuild() { name_ = rebuild_cb_(); }
+
+Labels HpuTimer::getExtraLabels() { return extra_labels_; }
+
+bool HpuTimer::isHang(time_t timeout) {
+  if (hang_counter_ < hang_counter_estimator_) return false;
+  hang_counter_ = 0;
+
+  static const std::chrono::seconds timeout_second =
+      std::chrono::seconds(timeout);
+  if (std::chrono::system_clock::now() - launch_time_for_hang_ >
+      timeout_second) {
+    launch_time_for_hang_ = std::chrono::system_clock::now();
+    return true;
+  }
+  return false;
+}
+
+bool HpuTimer::ignoreHang() { return is_barrier_; }
+
+bool HpuTimer::isHost() { return is_host_; }
+
+/*
+ * ===================================
+ * Static Overloads
+ * ===================================
+ * The following member functions are
+ * overloads from the base interface.
+ * ===================================
+ */
+
+void HpuTimer::doPrepareForDumpTrace() {
+  for (auto it : stream_timer_helper_) it.second->reset();
+}
+
+void HpuTimer::doPrepare() {
+  std::string device_name_from_env =
+      util::EnvVarRegistry::GetEnvVar<std::string>("XPU_TIMER_DEVICE_NAME");
+
+  // TODO(jingjun.lc): support device
+  // if (device_name_from_env != util::EnvVarRegistry::STRING_DEFAULT_VALUE) {
+  //   XLOG(INFO) << "Device type set to " << device_name_from_env;
+  //   HpuDataTypeUtils::setGpu(device_name_from_env);
+  // } else {
+  //   HpuDataTypeUtils::setGpu(platform::getDeviceName());
+  // }
+}
+
+void HpuTimer::dumpTraceMeta(const std::string& path,
+                             const std::vector<std::string>& extra) {
+  if (util::ensureDirExists(path)) {
+    XLOG(ERROR) << "Could not create dir for timeline.meta";
+    return;
+  }
+  std::filesystem::path dir(path);
+  std::filesystem::path file_path =
+      dir / util::getUniqueFileNameByCluster(".timeline.meta");
+  std::ostringstream oss;
+  for (const auto& it : tracing_metas_) {
+    oss << it.second << "," << it.first << std::endl;
+  }
+  for (const auto& it : extra) {
+    oss << "xpu_timer_host_trace," << it << std::endl;
+  }
+  std::ofstream file(file_path);
+  if (!file) {
+    XLOG(ERROR) << "Could not open timeline.meta for writing";
+    return;
+  }
+  file << oss.str();
+}
+
+std::string HpuTimer::collBucketFn(double performance, uint64_t problem_size,
+                                   Labels* label) {
+  if (problem_size <= 8192) {  // 8192 bits
+    (*label)["small"] = "1";
+  } else {
+    (*label)["small"] = "0";
+  }
+  double throughput_gbps = performance;
+  int level =
+      int(throughput_gbps * config::BvarMetricsConfig::comm_bucket_count /
+          config::BvarMetricsConfig::nic_bandwidth_gbps);  // bucketing up to
+                                                           // 400Gbps
+  std::string string_level;
+  std::string bucket_name;
+  if (level > config::BvarMetricsConfig::comm_bucket_count) {
+    level = config::BvarMetricsConfig::comm_bucket_count;
+    string_level = std::to_string(config::BvarMetricsConfig::comm_bucket_count);
+  } else {
+    string_level = std::to_string(level + 1);
+  }
+  (*label)["level"] = string_level;
+  static std::string coll_p =
+      std::string(constant::Metrics::CollMetrics::BUCKET_NAME);
+  bucket_name =
+      coll_p + string_level + (*label)["operation"] + (*label)["algorithm"];
+  return bucket_name;
+};
+
+std::string HpuTimer::matmulBucketFn(double performance, uint64_t problem_size,
+                                     Labels* label) {
+  double tflops = performance;
+  int level = getMatmulBucket(
+      tflops * config::BvarMetricsConfig::mm_bucket_count, (*label)["dtype"]);
+  std::string string_level;
+  std::string bucket_name;
+  if (level > config::BvarMetricsConfig::mm_bucket_count) {
+    level = config::BvarMetricsConfig::mm_bucket_count;
+    // the TFLOPS of flash_attn can exceed the hardware peak, so it is routed
+    // to an extra overflow level
+    string_level =
+        std::to_string(config::BvarMetricsConfig::mm_bucket_count + 1);
+  } else {
+    string_level = std::to_string(level + 1);
+  }
+  (*label)["level"] = string_level;
+
+  static std::string compute_p =
+      std::string(constant::Metrics::MatmulMetrics::BUCKET_NAME);
+  bucket_name = compute_p + string_level + (*label)["operation"];
+  return bucket_name;
+};
+
+/*
+ * ===================================
+ * Public methods
+ * ===================================
+ */
+
+void HpuTimer::reset_cb(std::function<FnReturn(HpuTimer*)> cb) {
+  auto rebuild_fn = [this, cb]() -> auto {
+    auto tup = cb(this);
+    std::string name;
+    std::tie(name, problem_size_, extra_labels_) = tup;
+
+    if (tracing_metas_.find(name) == tracing_metas_.end()) {
+      tracing_metas_[name] = kernel_encoding_counter_++;
+    }
+
+    trace_code_ = tracing_metas_[name];
+    trace_id_ = ++trace_id_counter_[trace_code_];
+
+    if (is_host_) {
+      finish_time_ = std::chrono::system_clock::now();
+    } else {
+      auto it = stream_timer_helper_.find(stream_);
+      if (it == stream_timer_helper_.end()) {
+        stream_timer_helper_.emplace(stream_,
+                                     new EventStartTimeHelper(stream_));
+      }
+    }
+
+    launch_time_timestamp_ =
+        std::chrono::duration_cast<std::chrono::microseconds>(
+            launch_time_.time_since_epoch())
+            .count();
+    is_barrier_ = false;
+    // torch.distributed.barrier shows up as an AllReduce of a single byte
+    // (dtype at::kByte, an alias of ncclUint8), i.e. a problem size of 8 bits.
+    auto it_label = extra_labels_.find("operation");
+    if (it_label != extra_labels_.end() && problem_size_ == 8 &&
+        it_label->second == "AllReduce")
+      is_barrier_ = true;
+    is_validate_to_trace_ = true;
+    return name;
+  };
+  rebuild_cb_ = rebuild_fn;
+  hang_counter_ = 0;
+}
+
+void HpuTimer::reset(aclrtStream s,
+                     std::function<FnReturn(HpuTimer*)> cb,
+                     const std::string_view& type) {
+  reset_stream(s, type);
+  reset_cb(cb);
+}
+
+// void HpuTimer::reset(std::function<FnReturn(HpuTimer*)> cb,
+//                      const std::string_view& type) {
+//   is_host_ = true;
+//   type_ = type;
+//   launch_time_ = std::chrono::system_clock::now();
+//   reset_cb(cb);
+// }
+
+void HpuTimer::reset_stream(aclrtStream s, const std::string_view& type) {
+  is_host_ = false;
+  stream_ = s;
+  // get the timestamp of the kernel launch
+  launch_time_ = std::chrono::system_clock::now();
+  launch_time_for_hang_ = std::chrono::system_clock::now();
+  // record the kernel-launch event
+  startRecord();
+  // plain assignment is cheap; capturing and copying objects into the
+  // closure is the heavier part.
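+  // (a std::string_view assignment copies only a pointer and a length, so
+  // storing the type tag this way is O(1))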
+ type_ = type; +} + +bool HpuTimer::isValidateToTrace() { return is_validate_to_trace_; } + +/* + * =================================== + * Class impl of InterceptManager + * =================================== + */ +void InterceptManager::interceptMatmulInfo(const aclTensor* self, + const aclTensor* other, + aclOpExecutor** executor) { + void* exe_addr = (void*)(*executor); + int64_t* self_dims; + uint64_t self_dims_num; + aclDataType self_datatype; + aclGetViewShape(self, &self_dims, &self_dims_num); + aclGetDataType(self, &self_datatype); + int64_t* other_dims; + uint64_t other_dims_num; + aclGetViewShape(other, &other_dims, &other_dims_num); + + auto get_bs_from_ND = [](uint64_t dims_num, int64_t* dims) -> auto { + int64_t bs = 1; + for (uint64_t i = 0; i < dims_num - 2; ++i) { + bs *= dims[i]; + } + return bs; + }; + int64_t b = get_bs_from_ND(self_dims_num, self_dims); + std::array matmul_bmnk = {b, *(self_dims + self_dims_num - 2), + *(other_dims + other_dims_num - 1), + *(self_dims + self_dims_num - 1)}; + // XLOG(INFO) << "insert matmul info: " << exe_addr << " " << matmul_bmnk[0] + // << matmul_bmnk[1] << matmul_bmnk[2] << matmul_bmnk[3]; + matmul_info_map_[exe_addr] = std::make_shared( + exe_addr, matmul_bmnk, self_datatype, "aclnnMatmul", + "xpu_timer_aclnnMatmul_"); +} + +std::function InterceptManager::handleMatmul( + aclOpExecutor* executor) { + auto matmul_info = [&]() -> auto { + void* exe_addr = (void*)(executor); + auto it = matmul_info_map_.find(exe_addr); + if (it != matmul_info_map_.end()) { + auto matmul_info_ptr = it->second; + // XLOG(INFO) << "delete matmul info: " << exe_addr; + // matmul_info_map_.erase(it); + return matmul_info_ptr; + } else { + return std::make_shared( + exe_addr, std::array({1, 1, 1, 1}), ACL_FLOAT, + "aclnnMatmul", "xpu_timer_aclnnMatmul_"); + } + }(); + auto fn = [matmul_info](HpuTimer* timer) -> auto { + std::ostringstream oss; + timer->trace->Clear(); + timer->trace->set_kernel_type( + constant::Metrics::MatmulMetrics::KERNEL_TYPE); + + hook::MatmulDebugData* mm_debug = timer->trace->mutable_mm_debug(); + + std::string compute_dtype = + HpuDataTypeUtils::getAclDtype(matmul_info->dtype_); + mm_debug->set_dtype(compute_dtype); + mm_debug->set_api("aclnnMatmul"); + + oss << "aclnnMatmul"; + uint64_t flop = 2; + for (const auto& v : matmul_info->bmnk_) { + oss << v << "_"; + mm_debug->add_shapes(v); + flop = flop * v; + } + + return std::make_tuple( + oss.str(), flop, + xpu_timer::Labels{{"dtype", compute_dtype}, {"operation", "Matmul"}}); + }; + return fn; +} + +void InterceptManager::interceptGroupedMatmulV2Info( + const aclTensorList* x, const aclTensorList* weight, + const aclIntArray* groupListOptional, int64_t splitItem, int64_t groupType, + aclOpExecutor** executor) { + void* exe_addr = (void*)(*executor); + auto tensor_is_transpose = [](const aclTensor& tensor) -> auto { + uint64_t strides_num; + int64_t* strides_value; + + aclGetViewStrides(&tensor, &strides_value, &strides_num); + return strides_value[strides_num - 1] != 1; + }; + auto get_tensor_list_shape_fn = + [&](const aclTensorList& tensor_list) -> auto { + uint64_t tensor_list_size; + aclGetTensorListSize(&tensor_list, &tensor_list_size); + + // {pair.first = dims number, pair.second = dims} + std::vector> tensor_shape_list; + for (uint64_t i = 0; i < tensor_list_size; ++i) { + int64_t* dims; + uint64_t dims_num; + aclGetViewShape(tensor_list[i], &dims, &dims_num); + const uint64_t LAST_2_DIM = dims_num - 2; + const uint64_t LAST_DIM = dims_num - 1; + if 
(tensor_is_transpose(*tensor_list[i])) { + std::swap(dims[LAST_2_DIM], dims[LAST_DIM]); + }; + tensor_shape_list.push_back({dims_num, dims}); + } + return tensor_shape_list; + }; + + aclDataType datatype; + aclGetDataType((*x)[0], &datatype); + + auto x_shape_list = get_tensor_list_shape_fn(*x); + auto weight_shape_list = get_tensor_list_shape_fn(*weight); + + std::vector> bmnks; + auto get_bs_from_ND = [](uint64_t dims_num, int64_t* dims) -> auto { + int64_t bs = 1; + const uint64_t LAST_2_DIM = dims_num - 2; + for (uint64_t i = 0; i < LAST_2_DIM; ++i) { + bs *= dims[i]; + } + return bs; + }; + if (groupType == -1) { + for (size_t i = 0; i < x_shape_list.size(); ++i) { + const uint64_t x_dims_num = x_shape_list[i].first; + const int64_t* x_dims = x_shape_list[i].second; + const uint64_t weight_dims_num = weight_shape_list[i].first; + const int64_t* weight_dims = weight_shape_list[i].second; + const uint64_t x_LAST_2_DIM = x_dims_num - 2; + const uint64_t x_LAST_DIM = x_dims_num - 1; + const uint64_t weight_LAST_DIM = weight_dims_num - 1; + + const int64_t b = + get_bs_from_ND(x_shape_list[i].first, x_shape_list[i].second); + const int64_t m = x_dims[x_LAST_2_DIM]; + const int64_t k = x_dims[x_LAST_DIM]; + const int64_t n = weight_dims[weight_LAST_DIM]; + bmnks.push_back({b, m, n, k}); + } + } else if (groupType == 0) { + if (x_shape_list.size() > 1 && weight_shape_list.size() > 1 && + (splitItem == 2 || splitItem == 3)) { + const uint64_t weight_dims_num = weight_shape_list[0].first; + const int64_t* weight_dims = weight_shape_list[0].second; + const uint64_t weight_LAST_DIM = weight_dims_num - 1; + const int64_t n = weight_dims[weight_LAST_DIM]; + + uint64_t groupListOptional_size; + aclGetIntArraySize(groupListOptional, &groupListOptional_size); + int64_t pre = 0; + for (uint64_t i = 0; i < groupListOptional_size; ++i) { + const uint64_t x_dims_num = x_shape_list[i].first; + const int64_t* x_dims = x_shape_list[i].second; + const uint64_t x_LAST_DIM = x_dims_num - 1; + const int64_t k = x_dims[x_LAST_DIM]; + const int64_t m = (*groupListOptional)[i] - pre; + pre = (*groupListOptional)[i]; + bmnks.push_back({1, m, n, k}); + } + } else if (x_shape_list.size() == 1 && (splitItem == 2 || splitItem == 3)) { + const uint64_t x_dims_num = x_shape_list[0].first; + const int64_t* x_dims = x_shape_list[0].second; + const uint64_t x_LAST_DIM = x_dims_num - 1; + const int k = x_dims[x_LAST_DIM]; + const uint64_t weight_dims_num = weight_shape_list[0].first; + const int64_t* weight_dims = weight_shape_list[0].second; + const uint64_t weight_LAST_DIM = weight_dims_num - 1; + const int n = weight_dims[weight_LAST_DIM]; + + uint64_t groupListOptional_size; + aclGetIntArraySize(groupListOptional, &groupListOptional_size); + int64_t pre = 0; + for (uint64_t i = 0; i < groupListOptional_size; ++i) { + const int64_t m = (*groupListOptional)[i] - pre; + pre = (*groupListOptional)[i]; + bmnks.push_back({1, m, n, k}); + } + } else if (x_shape_list.size() == 1 && (splitItem == 0 || splitItem == 1)) { + uint64_t groupListOptional_size; + + aclGetIntArraySize(groupListOptional, &groupListOptional_size); + int64_t pre = 0; + const uint64_t x_dims_num = x_shape_list[0].first; + const int64_t* x_dims = x_shape_list[0].second; + const uint64_t x_LAST_DIM = x_dims_num - 1; + const int k = x_dims[x_LAST_DIM]; + + for (uint64_t i = 0; i < groupListOptional_size; ++i) { + const int64_t* weight_dims = weight_shape_list[i].second; + const uint64_t weight_dims_num = weight_shape_list[i].first; + const int64_t m = 
(*groupListOptional)[i] - pre; + const uint64_t weight_LAST_DIM = weight_dims_num - 1; + const int n = weight_dims[weight_LAST_DIM]; + pre = (*groupListOptional)[i]; + bmnks.push_back({1, m, n, k}); + } + } else { + XLOG(INFO) << "Unspported input arguments: x.shape.size = " + << x_shape_list.size() + << "weight.shape.size = " << weight_shape_list.size() + << " groupType " << groupType << " splitItem " << splitItem; + bmnks.push_back({1, 1, 1, 1}); + } + } else if (groupType == 2) { + const uint64_t x_dims_num = x_shape_list[0].first; + const int64_t* x_dims = x_shape_list[0].second; + const uint64_t x_LAST_2_DIM = x_dims_num - 2; + const int m = x_dims[x_LAST_2_DIM]; + const uint64_t weight_dims_num = weight_shape_list[0].first; + const int64_t* weight_dims = weight_shape_list[0].second; + const uint64_t weight_LAST_DIM = weight_dims_num - 1; + const int n = weight_dims[weight_LAST_DIM]; + + uint64_t groupListOptional_size; + aclGetIntArraySize(groupListOptional, &groupListOptional_size); + int64_t pre = 0; + for (uint64_t i = 0; i < groupListOptional_size; ++i) { + const int64_t k = (*groupListOptional)[i] - pre; + pre = (*groupListOptional)[i]; + bmnks.push_back({1, m, n, k}); + } + } else { + XLOG(INFO) << "Unspported GroupType: groupType = " << groupType; + bmnks.push_back({1, 1, 1, 1}); + } + grouped_matmul_info_map_[exe_addr] = + std::make_shared( + exe_addr, bmnks, datatype, "aclnnGroupedMatmulV2", + "xpu_timer_aclnnGroupedMatmulV2_"); +} + +std::function +InterceptManager::handleGroupedMatmulV2(aclOpExecutor* executor) { + auto grouped_matmul_info = [&]() -> auto { + void* exe_addr = (void*)(executor); + auto it = grouped_matmul_info_map_.find(exe_addr); + if (it != grouped_matmul_info_map_.end()) { + auto grouped_matmul_info_ptr = it->second; + // grouped_matmul_info_map_.erase(it); + return grouped_matmul_info_ptr; + } else { + return std::make_shared( + exe_addr, std::vector({std::array({1, 1, 1, 1})}), + ACL_FLOAT, "aclnnGroupedMatmulV2", "xpu_timer_aclnnGroupedMatmulV2_"); + } + }(); + auto fn = [grouped_matmul_info](HpuTimer* timer) -> auto { + std::ostringstream oss; + timer->trace->Clear(); + timer->trace->set_kernel_type( + constant::Metrics::MatmulMetrics::KERNEL_TYPE); + + hook::GroupedMatmulDebugData* grouped_mm_debug = + timer->trace->mutable_grouped_mm_debug(); + + std::string compute_dtype = + HpuDataTypeUtils::getAclDtype(grouped_matmul_info->dtype_); + grouped_mm_debug->set_dtype(compute_dtype); + grouped_mm_debug->set_api("aclnnGroupedMatmulV2"); + + oss << "GroupedMatmul"; + + uint64_t flop = 0; + for (const auto& bmnk : grouped_matmul_info->bmnks_) { + // hook::GroupedMatmulDebugData::BMNK* bmnks = + // grouped_mm_debug->add_bmnks(); + uint64_t cur_flop = 2; + for (const auto& v : bmnk) { + // bmnks->add_shapes(v); + cur_flop = cur_flop * v; + } + flop += cur_flop; + } + grouped_mm_debug->set_tflops(flop); + + return std::make_tuple(oss.str(), flop, + xpu_timer::Labels{{"dtype", compute_dtype}, + {"operation", "GroupedMatmul"}}); + }; + return fn; +} +std::function InterceptManager::handleHccl( + uint64_t count, HcclDataType datatype, HcclComm& comm, + const std::string& func_name, const std::string& coll_type) { + auto fn = [count, datatype, comm, func_name, + coll_type](HpuTimer* timer) -> auto { + std::ostringstream oss; + timer->trace->Clear(); + timer->trace->set_kernel_type(constant::Metrics::CollMetrics::KERNEL_TYPE); + hook::NcclDebugData* hccl_debug = timer->trace->mutable_nccl_debug(); + + std::string dtype = 
HpuDataTypeUtils::getHcclDataType(datatype);
+    uint64_t comm_size = count * HpuDataTypeUtils::getDtypeSizeInBytes(dtype);
+
+    oss << "xpu_timer_" << func_name << "_size_" << comm_size;
+    // cann-hccl/src/domain/collective_communication/framework/inc/topoinfo_struct.h
+    constexpr uint32_t ROOTINFO_INDENTIFIER_MAX_LENGTH = 128;
+    char commName[ROOTINFO_INDENTIFIER_MAX_LENGTH];
+    HcclGetCommName(comm, commName);
+    std::string commNameStr(commName);
+    // TODO(jingjun): use a global map for commHash so it is computed only once
+    uint64_t commHash = std::hash<std::string>{}(commNameStr);
+
+    auto HPU_CLUSTER_CARD_NUMBER = util::config::GlobalConfig::local_world_size;
+    uint32_t nranks;
+    HcclGetRankSize(comm, &nranks);
+    uint32_t nNodes =
+        (nranks + HPU_CLUSTER_CARD_NUMBER - 1) / HPU_CLUSTER_CARD_NUMBER;
+
+    hccl_debug->set_comm_hash(commHash);
+    hccl_debug->set_input_size_in_bytes(comm_size);
+    hccl_debug->set_dtype(dtype);
+    hccl_debug->set_ranks(nranks);
+    hccl_debug->set_nodes(nNodes);
+    hccl_debug->set_seq(++(timer->hccl_seq_num[commHash]));
+
+    double factor = 1.0;
+    if (coll_type == "AllReduce") {
+      factor = 2.0 * (nranks - 1) / nranks;
+    } else if (coll_type == "AllGather" || coll_type == "ReduceScatter") {
+      // the input of reduce_scatter/allgather is already sharded, so we do
+      // not divide by world_size
+      factor = static_cast<double>(nranks - 1);
+    }
+
+    uint64_t problem_size = static_cast<uint64_t>(factor * comm_size);
+    uint64_t problem_size_bits = problem_size * 8;
+
+    hccl_debug->set_problem_size(problem_size);
+
+    return std::make_tuple(
+        oss.str(), problem_size_bits,
+        xpu_timer::Labels{
+            {"dtype", dtype},
+            {"operation", coll_type},
+            {"algorithm", "NotKnown"},
+            {"transport", nNodes > 1 ? "InterNode" : "IntraNode"}});
+  };
+  return fn;
+}
+
+}  // namespace hpu
+}  // namespace xpu_timer
diff --git a/xpu_timer/xpu_timer/hpu/hpu_timer.h b/xpu_timer/xpu_timer/hpu/hpu_timer.h
new file mode 100644
index 0000000000..17b030fe55
--- /dev/null
+++ b/xpu_timer/xpu_timer/hpu/hpu_timer.h
@@ -0,0 +1,272 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include "xpu_timer/common/logging.h"
+#include "xpu_timer/common/platform.h"
+#include "xpu_timer/common/xpu_timer.h"
+#include "xpu_timer/hpu/hpu_dtype_util.h"
+#include "xpu_timer/protos/hook.pb.h"
+namespace xpu_timer {
+namespace hpu {
+class InterceptManager;
+namespace config = xpu_timer::util::config;
+class EventStartTimeHelper {
+  /* Used to find out when a `torch kernel` actually starts running on the
+   * device. A helper event is recorded on the stream and synchronized, and
+   * the CPU time is taken immediately afterwards; the CPU time therefore
+   * approximates the helper event's position on the device timeline, and the
+   * kernel's real start time can be recovered from the device-measured gap
+   * between the helper event and the kernel's launch event:
+   *
+   *   kernel start time =
+   *       cpu time + elapsed(helper start event -> kernel launch event)
+   *
+   * Timeline:
+   *   reset()   : record helper start event, synchronize it, take cpu_time_
+   *   launch    : kernel launch event and kernel end event recorded on stream
+   *   getTime() : cpu_time_ + elapsed(helper start event -> launch event)
+   */
+
+ public:
+  EventStartTimeHelper(aclrtStream s);
+  void reset();
+  time_t getTime(aclrtEvent kernel_launch_start, bool* is_validate_to_trace);
+
+ private:
+  // start event on this stream
+  aclrtEvent start_event_;
+  aclrtEvent stop_event_;
+  // time in us
+  time_t cpu_time_;
+  aclrtStream stream_;
+};
+
+class HpuTimer : public XpuTimer {
+  /* Times kernels with ACL events (the HPU counterpart of CUDA events). */
+ public:
+  using InnerInterceptManager = InterceptManager;
+  using FnReturn = std::tuple<std::string, uint64_t, Labels>;
+
+  explicit HpuTimer() {
+    auto status = aclrtCreateEvent(&start_event_);
+    if (status != ACL_ERROR_NONE) {
+      XLOG(ERROR) << "create event err: " << aclGetRecentErrMsg()
+                  << " stream: " << stream_;
+    }
+    status = aclrtCreateEvent(&stop_event_);
+    if (status != ACL_ERROR_NONE) {
+      XLOG(ERROR) << "create event err: " << aclGetRecentErrMsg()
+                  << " stream: " << stream_;
+    }
+    hang_counter_ = 0;
+    trace = new hook::KernelTrace();  // HpuTimer objects are pooled, so trace
+                                      // is never freed.
+  }
+
+  /*
+   * ===================================
+   * Interface Overrides
+   * ===================================
+   * Overrides of the XpuTimer interface.
+   * ===================================
+   */
+  void startRecord() override;
+  void endRecord() override;
+  bool isReady() override;
+  void reBuild() override;
+  uint64_t getDuration() override;
+  const std::string getName() override;
+  const std::string_view& getType() override;
+  const uint64_t getProblemSize() override;
+  time_t getExecuteTimeStamp() override;
+  time_t getLaunchTimeStamp() override;
+  int getTraceCode() override;
+  uint64_t getTraceId() override;
+  Labels getExtraLabels() override;
+  bool isHang(time_t timeout) override;
+  bool ignoreHang() override;
+  bool isHost() override;
+  bool isValidateToTrace() override;
+
+  /*
+   * ===================================
+   * Static overloads
+   * ===================================
+   * Overloads of the XpuTimer statics.
+   * ===================================
+   */
+  static void doPrepare();  // parse nccl syms if needed.
+  // dump the trace meta, a mapping from trace_code -> kernel_name
+  static void dumpTraceMeta(const std::string& path,
+                            const std::vector<std::string>& extra);
+  static void
+  doPrepareForDumpTrace();  // resets the start timer on each stream; used to
+                            // get timestamps for kernels running on the
+                            // device.
+
+  /*
+   * ===================================
+   * public methods and vars
+   * ===================================
+   */
+
+  void reset(aclrtStream s, std::function<FnReturn(HpuTimer*)> cb,
+             const std::string_view& type);
+
+  // kernel trace object
+  hook::KernelTrace* trace;
+
+  // seq number for each hccl comm; key is the comm hash, value auto-increments
+  // from 0
+  static std::unordered_map<uint64_t, uint64_t> hccl_seq_num;
+
+  static double mmPerformance(uint64_t latency_in_us, uint64_t problem_size) {
+    return (double)(problem_size) / latency_in_us / 1e6;  // Tflops
+  };
+
+  static double collPerformance(uint64_t latency_in_us, uint64_t problem_size) {
+    return (double)(problem_size) / 1e3 / latency_in_us;  // Gbps
+  }
+
+  static std::string collBucketFn(double performance, uint64_t problem_size,
+                                  Labels* label);
+
+  static int getMatmulBucket(double performance, const std::string& dtype) {
+    return (int)(performance / HpuDataTypeUtils::getGpuHardwareFlops(dtype));
+  }
+
+  static std::string matmulBucketFn(double performance, uint64_t problem_size,
+                                    Labels* label);
+
+ private:
+  void reset_cb(std::function<FnReturn(HpuTimer*)> cb);
+  void reset_stream(aclrtStream s, const std::string_view& type);
+
+  aclrtEvent start_event_, stop_event_;  // owned
+  aclrtStream stream_;                   // not owned
+  // Returns the kernel name. The callback runs in a background thread, so be
+  // careful about the lifetime of objects captured in the closure.
+  std::function<std::string()> rebuild_cb_;
+  // kernel type; currently one of batched matmul, matmul, coll
+  std::string_view type_;
+  // kernel name with its params concatenated.
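+  // e.g. "aclnnMatmul1_4096_4096_1024_" (b_m_n_k) or
+  // "xpu_timer_HcclAllReduce_size_1024"; see the name builders in
+  // handleMatmul/handleHccl in hpu_timer.cc.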
+
+class HpuTimer : public XpuTimer {
+  /* Use ACL events to time kernels. */
+ public:
+  using InnerInterceptManager = InterceptManager;
+  using FnReturn = std::tuple<std::string, uint64_t, Labels>;
+
+  explicit HpuTimer() {
+    auto status = aclrtCreateEvent(&start_event_);
+    if (status != ACL_ERROR_NONE) {
+      XLOG(ERROR) << "create event err: " << aclGetRecentErrMsg()
+                  << " stream: " << stream_;
+    }
+    status = aclrtCreateEvent(&stop_event_);
+    if (status != ACL_ERROR_NONE) {
+      XLOG(ERROR) << "create event err: " << aclGetRecentErrMsg()
+                  << " stream: " << stream_;
+    }
+    hang_counter_ = 0;
+    trace = new hook::KernelTrace();  // HpuTimer is pooled; trace should
+                                      // never be freed.
+  }
+
+  /*
+   * ===================================
+   * Interface Overrides
+   * ===================================
+   * overrides from the XpuTimer
+   * ===================================
+   */
+  void startRecord() override;
+  void endRecord() override;
+  bool isReady() override;
+  void reBuild() override;
+  uint64_t getDuration() override;
+  const std::string getName() override;
+  const std::string_view& getType() override;
+  const uint64_t getProblemSize() override;
+  time_t getExecuteTimeStamp() override;
+  time_t getLaunchTimeStamp() override;
+  int getTraceCode() override;
+  uint64_t getTraceId() override;
+  Labels getExtraLabels() override;
+  bool isHang(time_t timeout) override;
+  bool ignoreHang() override;
+  bool isHost() override;
+  bool isValidateToTrace() override;
+
+  /*
+   * ===================================
+   * Static overloads
+   * ===================================
+   * Overloads from the XpuTimer
+   * ===================================
+   */
+  static void doPrepare();  // parse hccl syms if needed.
+  // dump the trace meta, mapping from trace_code -> kernel_name
+  static void dumpTraceMeta(const std::string& path,
+                            const std::vector<std::string>& extra);
+  static void
+  doPrepareForDumpTrace();  // reset the start timer on each stream; it is used
+                            // to get the timestamp of a kernel while it is
+                            // running on the device.
+
+  /*
+   * ===================================
+   * public methods and vars
+   * ===================================
+   */
+
+  void reset(aclrtStream s, std::function<FnReturn()> cb,
+             const std::string_view& type);
+
+  // kernel trace object
+  hook::KernelTrace* trace;
+
+  // seq number for each hccl comm; key is the comm hash, value
+  // auto-increments from 0
+  static std::unordered_map<uint64_t, uint64_t> hccl_seq_num;
+
+  static double mmPerformance(uint64_t latency_in_us, uint64_t problem_size) {
+    return (double)(problem_size) / latency_in_us / 1e6;  // Tflops
+  }
+
+  static double collPerformance(uint64_t latency_in_us,
+                                uint64_t problem_size) {
+    return (double)(problem_size) / 1e3 / latency_in_us;  // Gbps
+  }
+
+  static std::string collBucketFn(double performance, uint64_t problem_size,
+                                  Labels* label);
+
+  static int getMatmulBucket(double performance, const std::string& dtype) {
+    return (int)(performance / HpuDataTypeUtils::getGpuHardwareFlops(dtype));
+  }
+
+  static std::string matmulBucketFn(double performance, uint64_t problem_size,
+                                    Labels* label);
+
+ private:
+  void reset_cb(std::function<FnReturn()> cb);
+  void reset_stream(aclrtStream s, const std::string_view& type);
+
+  aclrtEvent start_event_, stop_event_;  // owned
+  aclrtStream stream_;                   // not owned
+  // returns the kernel name; it is a callback invoked from a background
+  // thread, so be careful with the lifetime of objects in the closure.
+  std::function<FnReturn()> rebuild_cb_;
+  // kernel type, currently batched matmul, matmul, or coll
+  std::string_view type_;
+  // kernel name with params concatenated together
+  std::string name_;
+  // records when the kernel launch or host call happened
+  time_t launch_time_timestamp_;
+  std::chrono::time_point<std::chrono::system_clock> launch_time_;
+  // countdown for detecting kernel hang
+  std::chrono::time_point<std::chrono::system_clock> launch_time_for_hang_;
+
+  // for matmul/fa this is flops; for communication it is bits
+  uint64_t problem_size_;
+  // labels for prometheus.
+  Labels extra_labels_;
+
+  // id of this kernel with params; used as KernelTrace::trace_code
+  int trace_code_;
+
+  // auto-incremented counter by name; used to generate the code in
+  // KernelTrace::trace_code
+  static std::unordered_map<std::string, int> tracing_metas_;
+  // global counter for encoding kernel names with params to int.
+  static int kernel_encoding_counter_;
+  // records the launch count for each kernel; useful for comparing kernels
+  // across different ranks, and becomes an arg in the chrome trace json file.
+  // key is from tracing_metas_, value is the count.
+  static std::unordered_map<std::string, uint64_t> trace_id_counter_;
+  // trace id for each type of kernel
+  uint64_t trace_id_;
+  // each stream has a helper to get the kernel's real running time
+  static std::unordered_map<aclrtStream, EventStartTimeHelper*>
+      stream_timer_helper_;
+  // hang counter; the poller interval is 100us, so once the counter reaches
+  // 10000 we check against the timeout timestamp.
+  uint64_t hang_counter_;
+  constexpr static uint64_t hang_counter_estimator_ = 10000;
+  // is torch.dist.barrier op
+  bool is_barrier_;
+  // is on host
+  bool is_host_;
+  std::chrono::time_point<std::chrono::system_clock> finish_time_;
+  // Some events do not need to be traced, such as events that begin before the
+  // EventStartTimeHelper.
+  bool is_validate_to_trace_;
+};
+
+class InterceptManager {
+ private:
+  struct MatmulInfo {
+    void* executor_addr_;
+    std::array<int64_t, 4> bmnk_;
+    aclDataType dtype_;
+    std::string api_;
+    std::string name_prefix_;
+
+    MatmulInfo(void* executor_addr_, std::array<int64_t, 4> bmnk_,
+               aclDataType dtype_, std::string api_, std::string name_prefix_)
+        : executor_addr_(executor_addr_),
+          bmnk_(bmnk_),
+          dtype_(dtype_),
+          api_(api_),
+          name_prefix_(name_prefix_) {}
+  };
+  struct GroupedMatmulInfo {
+    void* executor_addr_;
+    std::vector<std::array<int64_t, 4>> bmnks_;
+    aclDataType dtype_;
+    std::string api_;
+    std::string name_prefix_;
+
+    GroupedMatmulInfo(void* executor_addr_,
+                      std::vector<std::array<int64_t, 4>> bmnks_,
+                      aclDataType dtype_, std::string api_,
+                      std::string name_prefix_)
+        : executor_addr_(executor_addr_),
+          bmnks_(bmnks_),
+          dtype_(dtype_),
+          api_(api_),
+          name_prefix_(name_prefix_) {}
+  };
+  std::unordered_map<void*, std::shared_ptr<MatmulInfo>> matmul_info_map_;
+  std::unordered_map<void*, std::shared_ptr<GroupedMatmulInfo>>
+      grouped_matmul_info_map_;
+
+ public:
+  // matmul info
+  void interceptMatmulInfo(const aclTensor* self, const aclTensor* other,
+                           aclOpExecutor** executor);
+  std::function<HpuTimer::FnReturn()> handleMatmul(aclOpExecutor* executor);
+  std::function<HpuTimer::FnReturn()> handleHccl(
+      uint64_t count, HcclDataType datatype, HcclComm& comm,
+      const std::string& func_name, const std::string& coll_type);
+
+  // grouped matmul info
+  void interceptGroupedMatmulV2Info(const aclTensorList* x,
+                                    const aclTensorList* weight,
+                                    const aclIntArray* groupListOptional,
+                                    int64_t splitItem, int64_t groupType,
+                                    aclOpExecutor** executor);
+
+  std::function<HpuTimer::FnReturn()> handleGroupedMatmulV2(
+      aclOpExecutor* executor);
+};
+}  // namespace hpu
+}  // namespace xpu_timer
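// Editor's note — a quick numeric check of the two performance helpers above
// (values are illustrative, not measurements).
#include <cstdio>

int main() {
  // 4096x4096x4096 matmul: 2*M*N*K = 137,438,953,472 FLOPs; at 500 us,
  // mmPerformance gives 137438953472 / 500 / 1e6 ~= 274.9 TFLOPS.
  double tflops = 137438953472.0 / 500 / 1e6;
  // the 8-rank, 1 MiB AllReduce earlier: 14,680,064 problem-size bits; at
  // 100 us, collPerformance gives 14680064 / 1e3 / 100 ~= 146.8 Gbps.
  double gbps = 14680064.0 / 1e3 / 100;
  std::printf("%.1f TFLOPS, %.1f Gbps\n", tflops, gbps);
  return 0;
}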
diff --git a/xpu_timer/xpu_timer/hpu/only_keep_hpu.lds b/xpu_timer/xpu_timer/hpu/only_keep_hpu.lds
new file mode 100644
index 0000000000..3afe0e43a5
--- /dev/null
+++ b/xpu_timer/xpu_timer/hpu/only_keep_hpu.lds
@@ -0,0 +1,28 @@
+# -fvisibility=hidden cannot hide these symbols; they may be declared with
+# extern "C" or have their visibility set to default.
+# bthread_jump_fcontext
+# bthread_make_fcontext
+# _ZN6google8protobuf8internal8byteswapILi1EEEvPv
+# _ZN6google8protobuf8internal8byteswapILi4EEEvPv
+# _ZN6google8protobuf8internal8byteswapILi8EEEvPv
+{
+  global:
+    dlsym;
+    aclnnMatmulGetWorkspaceSize;
+    aclnnMatmul;
+    aclnnGroupedMatmulV2GetWorkspaceSize;
+    aclnnGroupedMatmulV2;
+    HcclAllReduce;
+    HcclBroadcast;
+    HcclReduceScatter;
+    HcclReduce;
+    HcclAlltoAll;
+    HcclAlltoAllV;
+    HcclAllGather;
+    HcclBarrier;
+    HcclScatter;
+    HcclSend;
+    HcclRecv;
+    HcclBatchSendRecv;
+  local:
+    *;
+};

From 6e247852904640025d439601455d4ce1d9b78cb0 Mon Sep 17 00:00:00 2001
From: BO SANG
Date: Thu, 10 Jul 2025 20:36:33 -0700
Subject: [PATCH 2/3] sync

---
 xpu_timer/config_subparsers/hpu.py    | 67 +++++++++++++++++++++++++++
 xpu_timer/xpu_timer/hpu/hpu_timer.cc  | 10 ++--
 xpu_timer/xpu_timer/protos/hook.proto | 29 ++++++++----
 3 files changed, 93 insertions(+), 13 deletions(-)
 create mode 100644 xpu_timer/config_subparsers/hpu.py

diff --git a/xpu_timer/config_subparsers/hpu.py b/xpu_timer/config_subparsers/hpu.py
new file mode 100644
index 0000000000..135943578e
--- /dev/null
+++ b/xpu_timer/config_subparsers/hpu.py
@@ -0,0 +1,67 @@
+import textwrap
+
+from . import BaseBuildRender
+
+
+class BuildRender(BaseBuildRender):
+    @classmethod
+    def add_arguments(cls, parser):
+        BaseBuildRender.add_arguments(parser)
+
+    def __post_init__(self):
+        if self.args.sdk_path is None:
+            self.args.sdk_path = "/usr/local/Ascend/ascend-toolkit/latest/"
+
+    def rend_config_bzl(self):
+        hpu_config = textwrap.dedent(
+            """
+            XPU_TIMER_CONFIG = struct(
+                linkopt = [
+                    "-Wl,--version-script=$(location //xpu_timer/hpu:only_keep_hpu.lds)",
+                    "-L{hpu_path}/lib64",
+                ],
+                copt = [
+                    "-DXPU_HPU",
+                ],
+                deps = ["@hpu//:hpu_headers", "//xpu_timer/hpu:only_keep_hpu.lds"],
+                timer_deps = ["//xpu_timer/hpu:hpu_timer"],
+                hook_deps = ["//xpu_timer/hpu:hpu_hook"],
+            )"""
+        )
+
+        self.xpu_timer_config.append(hpu_config.format(hpu_path=self.sdk_path))
+        return "\n".join(self.xpu_timer_config)
+
+    def rend_bazelrc(self):
+        self.bazelrc_config.append(f"build --repo_env=HPU_HOME={self.sdk_path}")
+        return "\n".join(self.bazelrc_config)
+
+    def setup_files(self):
+        with open("WORKSPACE.template") as f:
+            workspace = f.read()
+
+        deps = textwrap.dedent(
+            """
+            load("//third_party/hpu:hpu_workspace.bzl", "hpu_workspace")
+            hpu_workspace()
+            """
+        )
+        return workspace + deps
+
+    def setup_platform_version(self):
+        # /usr/local/Ascend/ascend-toolkit/latest/version.cfg
+        version = None
+        path = f"{self.sdk_path}/version.cfg"
+        pattern = "toolkit_installed_version="
+        with open(path) as f:
+            for line in f:
+                if line.startswith(pattern):
+                    version = line.split(pattern)[-1]
+                    break
+        if version is None:
+            raise ValueError("Cannot find version")
+
+        major = version.split(":")[-1].replace("]", "").replace(".", "")
+        minor = ""
+        return f"hu{major}{minor}", "HPU"

diff --git a/xpu_timer/xpu_timer/hpu/hpu_timer.cc b/xpu_timer/xpu_timer/hpu/hpu_timer.cc
index af91ea50b6..cfc1b497ce 100644
--- a/xpu_timer/xpu_timer/hpu/hpu_timer.cc
+++ b/xpu_timer/xpu_timer/hpu/hpu_timer.cc
@@ -416,7 +416,8 @@ std::function<HpuTimer::FnReturn()> InterceptManager::handleMatmul(
   timer->trace->set_kernel_type(
       constant::Metrics::MatmulMetrics::KERNEL_TYPE);
 
-  hook::MatmulDebugData* mm_debug = timer->trace->mutable_mm_debug();
+  hook::KernelDebugData* debug_data = timer->trace->mutable_debug_data();
+  hook::MatmulDebugData* mm_debug = debug_data->mutable_mm_debug();
 
   std::string compute_dtype =
       HpuDataTypeUtils::getAclDtype(matmul_info->dtype_);
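// Editor's note — the grouped-matmul and hccl hunks below apply the same
// indirection. A hedged sketch of the resulting write pattern (the helper
// name is illustrative; the message layout follows the hook.proto change
// later in this patch):
#include "xpu_timer/protos/hook.pb.h"

hook::MatmulDebugData* mutableMmDebug(hook::KernelTrace* trace) {
  // every kernel-specific debug message now hangs off KernelTrace.debug_data
  // instead of a oneof directly inside KernelTrace
  hook::KernelDebugData* debug_data = trace->mutable_debug_data();
  // taking mm_debug selects the kernel_data oneof case for this trace
  return debug_data->mutable_mm_debug();
}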
@@ -616,8 +617,8 @@ InterceptManager::handleGroupedMatmulV2(aclOpExecutor* executor) {
   timer->trace->set_kernel_type(
       constant::Metrics::MatmulMetrics::KERNEL_TYPE);
 
-  hook::GroupedMatmulDebugData* grouped_mm_debug =
-      timer->trace->mutable_grouped_mm_debug();
+  hook::KernelDebugData* debug_data = timer->trace->mutable_debug_data();
+  hook::GroupedMatmulDebugData* grouped_mm_debug = debug_data->mutable_grouped_mm_debug();
 
   std::string compute_dtype =
       HpuDataTypeUtils::getAclDtype(grouped_matmul_info->dtype_);
@@ -653,7 +654,8 @@ std::function<HpuTimer::FnReturn()> InterceptManager::handleHccl(
   std::ostringstream oss;
   timer->trace->Clear();
   timer->trace->set_kernel_type(constant::Metrics::CollMetrics::KERNEL_TYPE);
-  hook::NcclDebugData* hccl_debug = timer->trace->mutable_nccl_debug();
+  hook::KernelDebugData* debug_data = timer->trace->mutable_debug_data();
+  hook::NcclDebugData* hccl_debug = debug_data->mutable_nccl_debug();
 
   std::string dtype = HpuDataTypeUtils::getHcclDataType(datatype);
   uint64_t comm_size = count * HpuDataTypeUtils::getDtypeSizeInBytes(dtype);

diff --git a/xpu_timer/xpu_timer/protos/hook.proto b/xpu_timer/xpu_timer/protos/hook.proto
index d6b2ca8133..bd05e553ab 100644
--- a/xpu_timer/xpu_timer/protos/hook.proto
+++ b/xpu_timer/xpu_timer/protos/hook.proto
@@ -31,9 +31,26 @@ message InterceptSymbolByOffset {
   map<uint64, InterceptSymbol> symbols = 1;
 }
 
-message NcclDebugData {
+message KernelDebugData {
+  oneof launch_data {
+    NvidiaLaunchData nvidia_launch_data = 1;
+  }
+  oneof kernel_data {
+    MatmulDebugData mm_debug = 3;
+    NcclDebugData nccl_debug = 4;
+    FaDebugData fa_debug = 5;
+    MemoryDebugData memory_debug = 6;
+    GroupedMatmulDebugData grouped_mm_debug = 7;
+  }
+}
+
+message NvidiaLaunchData {
   repeated uint32 grids = 1;   // dim3
   repeated uint32 blocks = 2;  // dim3
+  uint32 stream_id = 3;
+}
+
+message NcclDebugData {
   uint64 comm_hash = 3;
   uint64 input_size_in_bytes = 4;  // diff coll operator are different
   string dtype = 5;
@@ -83,14 +100,8 @@ message KernelTrace {
   uint32 dur_us = 4;
   uint32 delay_us = 5;
   uint64 trace_id = 6;
-  oneof debug_data {
-    MatmulDebugData mm_debug = 7;
-    FaDebugData fa_debug = 8;
-    NcclDebugData nccl_debug = 9;
-    MemoryDebugData memory_debug = 10;
-    GroupedMatmulDebugData grouped_mm_debug = 12;
-  }
-  bool is_host = 11;
+  KernelDebugData debug_data = 7;
+  bool is_host = 8;
 }
 
 message GcDebugData {
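// Editor's note — on the read side, consumers that used to switch on
// KernelTrace's oneof now dispatch through debug_data. A hedged sketch
// (the function name and the returned labels are illustrative):
#include "xpu_timer/protos/hook.pb.h"

const char* kernelDataKind(const hook::KernelTrace& trace) {
  switch (trace.debug_data().kernel_data_case()) {
    case hook::KernelDebugData::kMmDebug:        return "matmul";
    case hook::KernelDebugData::kNcclDebug:      return "collective";
    case hook::KernelDebugData::kFaDebug:        return "flash_attention";
    case hook::KernelDebugData::kMemoryDebug:    return "memory";
    case hook::KernelDebugData::kGroupedMmDebug: return "grouped_matmul";
    default:                                     return "unset";
  }
}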
From b5bd28a23070963b2ee3390bac067512ad70cda1 Mon Sep 17 00:00:00 2001
From: BO SANG
Date: Sun, 20 Jul 2025 18:37:53 -0700
Subject: [PATCH 3/3] update

---
 xpu_timer/third_party/hpu/BUILD.bazel       |  0
 xpu_timer/third_party/hpu/hpu.BUILD         | 34 +++++++++++++++++++++
 xpu_timer/third_party/hpu/hpu_workspace.bzl | 10 ++++++
 3 files changed, 44 insertions(+)
 create mode 100644 xpu_timer/third_party/hpu/BUILD.bazel
 create mode 100644 xpu_timer/third_party/hpu/hpu.BUILD
 create mode 100644 xpu_timer/third_party/hpu/hpu_workspace.bzl

diff --git a/xpu_timer/third_party/hpu/BUILD.bazel b/xpu_timer/third_party/hpu/BUILD.bazel
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/xpu_timer/third_party/hpu/hpu.BUILD b/xpu_timer/third_party/hpu/hpu.BUILD
new file mode 100644
index 0000000000..ae3a994d22
--- /dev/null
+++ b/xpu_timer/third_party/hpu/hpu.BUILD
@@ -0,0 +1,34 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "hpu_headers",
+    hdrs = glob(
+        include = [
+            "include/**/*.h",
+            "include/**/*.hpp",
+        ],
+    ),
+    includes = [
+        "include",
+        "include/aclnn",
+    ],
+)
+
+cc_import(
+    name = "opapi",
+    shared_library = "lib64/libopapi.so",
+    includes = ["include"],
+    alwayslink = 1,
+)
+
+cc_import(
+    name = "ascendcl",
+    shared_library = "lib64/libascendcl.so",
+    includes = ["include"],
+    alwayslink = 1,
+)
+
+cc_import(
+    name = "hccl",
+    shared_library = "lib64/libhccl.so",
+    alwayslink = 1,
+)

diff --git a/xpu_timer/third_party/hpu/hpu_workspace.bzl b/xpu_timer/third_party/hpu/hpu_workspace.bzl
new file mode 100644
index 0000000000..e894d7cfee
--- /dev/null
+++ b/xpu_timer/third_party/hpu/hpu_workspace.bzl
@@ -0,0 +1,10 @@
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("//:workspace.bzl", "ali_code_repository", "dynamic_local_repository")
+
+def hpu_workspace():
+    dynamic_local_repository(
+        name = "hpu",
+        include_default_path = "/usr/local/Ascend/ascend-toolkit/latest",
+        build_file = "//third_party/hpu:hpu.BUILD",
+        include = "ASCEND_TOOLKIT_HOME",
+    )
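// Editor's note — a hedged end-to-end sketch of driving the HpuTimer API from
// hpu_timer.h by hand. In the real tool the manager pool owns the timers and
// a background poller consumes them; the literal type tag, the zero problem
// size, and the spin-wait here are illustrative only, and an initialized ACL
// runtime/device is assumed.
#include <cstdint>
#include <string>
#include <tuple>

#include "xpu_timer/hpu/hpu_timer.h"

void timeOneKernel(aclrtStream stream) {
  using xpu_timer::hpu::HpuTimer;
  HpuTimer timer;  // the constructor creates its start/stop ACL events
  auto rebuild = []() -> HpuTimer::FnReturn {
    // (kernel name, problem size, prometheus labels)
    return std::make_tuple(std::string("xpu_timer_example"), uint64_t{0},
                           xpu_timer::Labels{});
  };
  timer.reset(stream, rebuild, "matmul");
  timer.startRecord();
  // ... launch the kernel on `stream` here ...
  timer.endRecord();
  while (!timer.isReady()) {
    // real code polls from a background thread instead of spinning
  }
  uint64_t latency_us = timer.getDuration();
  double tflops = HpuTimer::mmPerformance(latency_us, /*problem_size=*/0);
  (void)tflops;
}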