From bf93524af782db46ecc5a350f8e8145f98b0a71f Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 11 Sep 2023 08:28:28 +0000 Subject: [PATCH 01/55] fix readme --- FasterTransformerReadME.md | 417 ++++++++++++++++++++++++++++++++++++ README.md | 418 +------------------------------------ 2 files changed, 425 insertions(+), 410 deletions(-) create mode 100644 FasterTransformerReadME.md diff --git a/FasterTransformerReadME.md b/FasterTransformerReadME.md new file mode 100644 index 000000000..a00e0d631 --- /dev/null +++ b/FasterTransformerReadME.md @@ -0,0 +1,417 @@ +# FasterTransformer + +This repository provides a script and recipe to run the highly optimized transformer-based encoder and decoder component, and it is tested and maintained by NVIDIA. + +## Table Of Contents + +- [FasterTransformer](#fastertransformer) + - [Table Of Contents](#table-of-contents) + - [Model overview](#model-overview) + - [Support matrix](#support-matrix) + - [Advanced](#advanced) + - [Global Environment](#global-environment) + - [Performance](#performance) + - [BERT base performance](#bert-base-performance) + - [BERT base performances of FasterTransformer new features](#bert-base-performances-of-fastertransformer-new-features) + - [BERT base performance on TensorFlow](#bert-base-performance-on-tensorflow) + - [BERT base performance on PyTorch](#bert-base-performance-on-pytorch) + - [Decoding and Decoder performance](#decoding-and-decoder-performance) + - [Decoder and Decoding end-to-end translation performance on TensorFlow](#decoder-and-decoding-end-to-end-translation-performance-on-tensorflow) + - [Decoder and Decoding end-to-end translation performance on PyTorch](#decoder-and-decoding-end-to-end-translation-performance-on-pytorch) + - [GPT performance](#gpt-performance) + - [Release notes](#release-notes) + - [Changelog](#changelog) + - [Known issues](#known-issues) + +## Model overview + +In NLP, encoder and decoder are two important components, with the transformer layer becoming a popular architecture for both components. FasterTransformer implements a highly optimized transformer layer for both the encoder and decoder for inference. On Volta, Turing and Ampere GPUs, the computing power of Tensor Cores are used automatically when the precision of the data and weights are FP16. + +FasterTransformer is built on top of CUDA, cuBLAS, cuBLASLt and C++. We provide at least one API of the following frameworks: TensorFlow, PyTorch and Triton backend. Users can integrate FasterTransformer into these frameworks directly. For supporting frameworks, we also provide example codes to demonstrate how to use, and show the performance on these frameworks. 
+ +### Support matrix + +| Models | Framework | FP16 | INT8 (after Turing) | Sparsity (after Ampere) | Tensor parallel | Pipeline parallel | FP8 (after Hopper) | +| ---------------- | -------------- | ---- | ------------------- | ----------------------- | --------------- | ----------------- | ------------------ | +| BERT | TensorFlow | Yes | Yes | - | - | - | - | +| BERT | PyTorch | Yes | Yes | Yes | Yes | Yes | - | +| BERT | Triton backend | Yes | - | - | Yes | Yes | - | +| BERT | C++ | Yes | Yes | - | - | - | Yes | +| XLNet | C++ | Yes | - | - | - | - | - | +| Encoder | TensorFlow | Yes | Yes | - | - | - | - | +| Encoder | PyTorch | Yes | Yes | Yes | - | - | - | +| Decoder | TensorFlow | Yes | - | - | - | - | - | +| Decoder | PyTorch | Yes | - | - | - | - | - | +| Decoding | TensorFlow | Yes | - | - | - | - | - | +| Decoding | PyTorch | Yes | - | - | - | - | - | +| GPT | TensorFlow | Yes | - | - | - | - | - | +| GPT/OPT | PyTorch | Yes | - | - | Yes | Yes | Yes | +| GPT/OPT | Triton backend | Yes | - | - | Yes | Yes | - | +| GPT-MoE | PyTorch | Yes | - | - | Yes | Yes | - | +| BLOOM | PyTorch | Yes | - | - | Yes | Yes | - | +| BLOOM | Triton backend | Yes | - | - | Yes | Yes | - | +| GPT-J | Triton backend | Yes | - | - | Yes | Yes | - | +| Longformer | PyTorch | Yes | - | - | - | - | - | +| T5/UL2 | PyTorch | Yes | - | - | Yes | Yes | - | +| T5 | TensorFlow 2 | Yes | - | - | - | - | - | +| T5/UL2 | Triton backend | Yes | - | - | Yes | Yes | - | +| T5 | TensorRT | Yes | - | - | Yes | Yes | - | +| T5-MoE | PyTorch | Yes | - | - | Yes | Yes | - | +| Swin Transformer | PyTorch | Yes | Yes | - | - | - | - | +| Swin Transformer | TensorRT | Yes | Yes | - | - | - | - | +| ViT | PyTorch | Yes | Yes | - | - | - | - | +| ViT | TensorRT | Yes | Yes | - | - | - | - | +| GPT-NeoX | PyTorch | Yes | - | - | Yes | Yes | - | +| GPT-NeoX | Triton backend | Yes | - | - | Yes | Yes | - | +| BART/mBART | PyTorch | Yes | - | - | Yes | Yes | - | +| WeNet | C++ | Yes | - | - | - | - | - | +| DeBERTa | TensorFlow 2 | Yes | - | - | On-going | On-going | - | +| DeBERTa | PyTorch | Yes | - | - | On-going | On-going | - | + +* Note that the FasterTransformer supports the models above on C++ because all source codes are built on C++. + +More details of specific models are put in `xxx_guide.md` of [`docs/`](docs), where `xxx` means the model name. Some common questions and the respective answers are put in [`docs/QAList.md`](docs/QAList.md). Note that the model of Encoder and BERT are similar and we put the explanation into `bert_guide.md` together. + +## Advanced + +The following code lists the directory structure of FasterTransformer: + +``` +/src/fastertransformer: source code of FasterTransformer + |--/cutlass_extensions: Implementation of cutlass gemm/kernels. + |--/kernels: CUDA kernels for different models/layers and operations, like addBiasResiual. + |--/layers: Implementation of layer modules, like attention layer, ffn layer. + |--/models: Implementation of different models, like BERT, GPT. + |--/tensorrt_plugin: encapluate FasterTransformer into TensorRT plugin. 
+    |--/tf_op: custom TensorFlow OP implementation
+    |--/th_op: custom PyTorch OP implementation
+    |--/triton_backend: custom Triton backend implementation
+    |--/utils: Contains common CUDA utilities, like cublasMMWrapper, memory_utils
+/examples: C++, TensorFlow and PyTorch interface examples
+    |--/cpp: C++ interface examples
+    |--/pytorch: PyTorch OP examples
+    |--/tensorflow: TensorFlow OP examples
+    |--/tensorrt: TensorRT examples
+/docs: Documents explaining the implementation details of the different models, and showing the benchmarks
+/benchmark: Contains the scripts to run the benchmarks of different models
+/tests: Unit tests
+/templates: Documents explaining how to add a new model/example to the FasterTransformer repo
+```
+
+Note that many folders contain sub-folders to separate the different models. Quantization tools have been moved to `examples`, like `examples/tensorflow/bert/bert-quantization/` and `examples/pytorch/bert/bert-quantization-sparsity/`.
+
+
+### Global Environment
+
+FasterTransformer provides some convenient environment variables for debugging and testing.
+
+1. `FT_LOG_LEVEL`: This environment variable controls the log level of debug messages. More details are in `src/fastertransformer/utils/logger.h`. Note that the program prints a large number of messages, and becomes very slow, when the level is lower than `DEBUG`.
+2. `FT_NVTX`: If it is set to `ON`, as in `FT_NVTX=ON ./bin/gpt_example`, the program inserts NVTX tags to help profile the program.
+3. `FT_DEBUG_LEVEL`: If it is set to `DEBUG`, the program runs `cudaDeviceSynchronize()` after every kernel; otherwise, kernels are executed asynchronously by default. This is helpful for locating the failing kernel during debugging, but it affects performance significantly, so it should be used only for debugging.
+
+## Performance
+
+Hardware settings:
+
+* 8xA100-80GBs (with mclk 1593MHz, pclk 1410MHz) with AMD EPYC 7742 64-Core Processor
+* T4 (with mclk 5000MHz, pclk 1590MHz) with Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz
+
+To run the following benchmarks, we need to install the Unix calculator tool `bc`:
+
+```bash
+apt-get install bc
+```
+
+### BERT base performance
+
+The FP16 results of TensorFlow were obtained by running `benchmarks/bert/tf_benchmark.sh`.
+
+The INT8 results of TensorFlow were obtained by running `benchmarks/bert/tf_int8_benchmark.sh`.
+
+The FP16 results of PyTorch were obtained by running `benchmarks/bert/pyt_benchmark.sh`.
+
+The INT8 results of PyTorch were obtained by running `benchmarks/bert/pyt_int8_benchmark.sh`.
+
+More benchmarks are provided in [`docs/bert_guide.md`](docs/bert_guide.md#bert-performance).
+
+#### BERT base performances of FasterTransformer new features
+
+The following figure compares the performance of the new FasterTransformer features with standard FasterTransformer under FP16 on T4.
+
+For large batch sizes and sequence lengths, both EFF-FT and FT-INT8-v2 bring about a 2x speedup. Using Effective FasterTransformer and INT8-v2 at the same time brings about a 3.5x speedup compared to FasterTransformer FP16 for large cases.
+
+*(Figure: BERT base performance of EFF-FT and FT-INT8-v2 on T4, FP16)*
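+
+For reference, the numbers above come from the plain shell scripts listed in this section, and the variables from the [Global Environment](#global-environment) section can be combined with any example binary. The snippet below is only an illustrative sketch: the script paths and `./bin/gpt_example` are taken from this document, while launching from the repository root and the exact combination of variables are assumptions.
+
+```bash
+# Run the BERT FP16 and INT8 benchmarks (paths as listed above; assumed to be
+# launched from the repository root after building).
+bash benchmarks/bert/tf_benchmark.sh
+bash benchmarks/bert/pyt_int8_benchmark.sh
+
+# Combine the debugging environment variables with an example binary:
+# verbose logging, NVTX ranges for profiling, and per-kernel synchronization.
+FT_LOG_LEVEL=DEBUG FT_NVTX=ON FT_DEBUG_LEVEL=DEBUG ./bin/gpt_example
+```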
+
+#### BERT base performance on TensorFlow
+
+The following figure compares the performance of different FasterTransformer features with TensorFlow XLA under FP16 on T4.
+
+For small batch sizes and sequence lengths, FasterTransformer brings about a 3x speedup.
+
+For large batch sizes and sequence lengths, Effective FasterTransformer with INT8-v2 quantization brings about a 5x speedup.
+
+*(Figure: FasterTransformer vs. TensorFlow XLA, BERT base, FP16, T4)*
+
+#### BERT base performance on PyTorch
+
+The following figure compares the performance of different FasterTransformer features with PyTorch TorchScript under FP16 on T4.
+
+For small batch sizes and sequence lengths, the FasterTransformer custom extension (CustomExt) brings about a 4x ~ 6x speedup.
+
+For large batch sizes and sequence lengths, Effective FasterTransformer with INT8-v2 quantization brings about a 5x speedup.
+
+*(Figure: FasterTransformer vs. PyTorch TorchScript, BERT base, FP16, T4)*
+
+### Decoding and Decoder performance
+
+The results of TensorFlow were obtained by running `benchmarks/decoding/tf_decoding_beamsearch_benchmark.sh` and `benchmarks/decoding/tf_decoding_sampling_benchmark.sh`.
+
+The results of PyTorch were obtained by running `benchmarks/decoding/pyt_decoding_beamsearch_benchmark.sh`.
+
+In the decoding experiments, we used the following parameters:
+
+* head_num = 8
+* size_per_head = 64
+* num_layers = 6 for both encoder and decoder
+* vocabulary_size = 32001 for the TensorFlow sample codes, 31538 for the PyTorch sample codes
+* memory_hidden_dim = 512
+* max sequence length = 128
+
+More benchmarks are provided in [`docs/decoder_guide.md`](docs/decoder_guide.md#decoding-performance).
+
+#### Decoder and Decoding end-to-end translation performance on TensorFlow
+
+The following figure shows the speedup of the FT-Decoder op and the FT-Decoding op compared to TensorFlow under FP16 on T4. Here, we use the throughput of translating a whole test set, because the total number of generated tokens may differ between methods. Compared to TensorFlow, FT-Decoder provides a 1.5x ~ 3x speedup, while FT-Decoding provides a 4x ~ 18x speedup.
+
+*(Figure: FT-Decoder and FT-Decoding speedup over TensorFlow, FP16, T4)*
+
+#### Decoder and Decoding end-to-end translation performance on PyTorch
+
+The following figure shows the speedup of the FT-Decoder op and the FT-Decoding op compared to PyTorch under FP16 on T4. Here, we use the throughput of translating a whole test set, because the total number of generated tokens may differ between methods. Compared to PyTorch, FT-Decoder provides a 1.2x ~ 3x speedup, while FT-Decoding provides a 3.8x ~ 13x speedup.
+
+*(Figure: FT-Decoder and FT-Decoding speedup over PyTorch, FP16, T4)*
+
+### GPT performance
+
+The following figure compares the performance of Megatron and FasterTransformer under FP16 on A100.
+
+In the decoding experiments, we used the following parameters:
+
+* head_num = 96
+* size_per_head = 128
+* num_layers = 48 for the GPT-89B model, 96 for the GPT-175B model
+* data_type = FP16
+* vocab_size = 51200
+* top_p = 0.9
+* tensor parallel size = 8
+* input sequence length = 512
+* output sequence length = 32
+
+*(Figure: Megatron vs. FasterTransformer, GPT, FP16, A100)*
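+
+As a rough, illustrative sketch of how such a tensor-parallel run is typically launched (the `multi_gpu_gpt_example` binary name and the `mpirun` launch are assumptions based on the `examples/cpp` directory described above, not the exact benchmark command):
+
+```bash
+# One process per GPU: tensor parallel size = 8 on the 8xA100 node.
+mpirun -n 8 ./bin/multi_gpu_gpt_example
+```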
+ +## Release notes + +### Changelog + +May 2023 +- Fix bugs of generation early stopping + +January 2023 +- Support GPT MoE +- Support FP8 for Bert and GPT (**Experimental**) +- Support DeBERTa on TensorFlow 2 and PyTorch + +Dec 2022 +- **Release the FasterTransformer 5.2** +- Support min length penalty + +Nov 2022 +- Support T5 Tensorflow 2 custom op. +- Support T5 MoE +- Support WeNet +- Support BART & mBART +- Support SwinV2 +- Initial support for w8a8 int8 mode with GPT (preview) +- Support fused mha in GPT + +Oct 2022 +- Support BLOOM + +Sep 2022 +- Support factual sampling ([link](https://arxiv.org/pdf/2206.04624.pdf)) in gpt +- Support for IA3 adapting scheme in T5 + +Aug 2022 +- Support returning context tokens embeddings in GPT +- **Release the FasterTransformer 5.1** +- Support for interactive generation +- Support for attention time-limited memory +- Support mt5 and t5-v1.1 + +July 2022 +- Support UL2 huggingface ckpt. ([link](https://huggingface.co/google/ul2)) + - Fix bug of T5 under bfloat16. +- Add ViT INT8 TensorRT Plugin +- Support batch sampling +- Support shared context optimization in GPT model + +June 2022 +- Support streaming generation for triton backend. +- Support OPT. +- Support multi-node multi-GPU BERT under FP32, FP16 and BF16. + +May 2022 +- Support bfloat16 on most models. +- Support [prefix-prompt](https://arxiv.org/pdf/2101.00190.pdf) for GPT-J. +- Support GPT-NeoX. + - epsilon value used in layernorm is now a parameter + - rotary embedding GPT-NeoX style (only GPT-J was implemented) + - load per-GPU layernorm and bias parameters + - weight conversion from EleutherAI checkpoint + +April 2022 +- **Release the FasterTransformer 5.0** + - Change the default accumulation type of all gemm to FP32. + - Support bfloat16 inference in GPT model. + - Support Nemo Megatron T5 and Megatron-LM T5 model. + - Support ViT. + +March 2022 +- Support `stop_ids` and `ban_bad_ids` in GPT-J. +- Support dynamice `start_id` and `end_id` in GPT-J, GPT, T5 and Decoding. + +February 2022 +- Support Swin Transformer. +- Optimize the k/v cache update of beam search by in-direction buffer. +- Support runtime input for GPT-J, T5 and GPT. +- Support soft prompt in GPT and GPT-J. +- Support custom all reduce kernel. + - Limitation: + 1. Only support tensor parallel size = 8 on DGX-A100. + 2. Only support CUDA with cudaMallocAsync. + +December 2021 +- Add TensorRT plugin of T5 model. +- Change some hyper-parameters of GPT model to runtime query. +- Optimize the memory allocator under C++ code. +- Fix bug of CUB including when using CUDA 11.5 or newer version. + +November 2021 +- **Update the FasterTransformer 5.0 beta** +- Add GPT-3 INT8 weight only qauntization for batch size <= 2. +- Support multi-node multi-gpu support on T5. +- Enhance the multi-node multi-gpu supporting in GPT-3. + +August 2021 +- **Release the FasterTransformer 5.0 beta** + - Refactor the repo and codes + - And special thanks to NAVER Corp. for contributing a lot to this version, as listed below. + - Bugs fix + - Fix error that occurs when batch_size is less than max_batch_size for gpt pytorch wrapper. + - Fix memory leak that occurs every forward because of reused allocator. + - Fix race condition that occurs in repetition penalty kernel. + - Enhancement + - Add random seed setting. + - Fix GEMM buffer overflow on FP16 of GPT. + - Change to invalidate finished buffer for every completion. + - Introduce stop_before for early stop. + - Support Longformer. + - Rename `layer_para` to `pipeline_para`. 
+ - Optimize the sorting of top p sampling. + - Support sparsity for Ampere GPUs on BERT. + - Support `size_per_head` 96, 160, 192, 224, 256 for GPT model. + - Support multi-node inference for GPT Triton backend. + +June 2021 +- Support XLNet + +April 2021 +- **Release the FasterTransformer 4.0** + - Support multi-gpus and multi-nodes inference for GPT model on C++ and PyTorch. + - Support single node, multi-gpus inference for GPT model on triton. + - Add the int8 fused multi-head attention kernel for bert. + - Add the FP16 fused multi-head attention kernel of V100 for bert. + - Optimize the kernel of decoder. + - Move to independent repo. + - Eager mode PyTorch extension is deprecated. + +Dec 2020 +- **Release the FasterTransformer 3.1** + - Optimize the decoding by adding the finisehd mask to prevent useless computing. + - Support opennmt encoder. + - Remove the TensorRT plugin supporting. + - TorchScript custom op is deprecated. + +Nov 2020 +- Optimize the INT8 inference. +- Support PyTorch INT8 inference. +- Provide PyTorch INT8 quantiztion tools. +- Integrate the fused multi-head attention kernel of TensorRT into FasterTransformer. +- Add unit test of SQuAD. +- Update the missed NGC checkpoints. + +Sep 2020 +- Support GPT2 +- **Release the FasterTransformer 3.0** + - Support INT8 quantization of encoder of cpp and TensorFlow op. + - Add bert-tf-quantization tool. + - Fix the issue that Cmake 15 or Cmake 16 fail to build this project. + +Aug 2020 +- Fix the bug of trt plugin. + +June 2020 +- **Release the FasterTransformer 2.1** + - Add Effective FasterTransformer based on the idea of [Effective Transformer](https://github.com/bytedance/effective_transformer) idea. + - Optimize the beam search kernels. + - Add PyTorch op supporting + +May 2020 +- Fix the bug that seq_len of encoder must be larger than 3. +- Add the position_encoding of decoding as the input of FasterTransformer decoding. This is convenient to use different types of position encoding. FasterTransformer does not compute the position encoding value, but only lookup the table. +- Modifying the method of loading model in `translate_sample.py`. + +April 2020 +- Rename `decoding_opennmt.h` to `decoding_beamsearch.h` +- Add DiverseSiblingsSearch for decoding. +- Add sampling into Decoding + - The implementation is in the `decoding_sampling.h` + - Add top_k sampling, top_p sampling for decoding. +- Refactor the tensorflow custom op codes. + - Merge `bert_transformer_op.h`, `bert_transformer_op.cu.cc` into `bert_transformer_op.cc` + - Merge `decoder.h`, `decoder.cu.cc` into `decoder.cc` + - Merge `decoding_beamsearch.h`, `decoding_beamsearch.cu.cc` into `decoding_beamsearch.cc` +- Fix the bugs of finalize function decoding.py. +- Fix the bug of tf DiverseSiblingSearch. +- Add BLEU scorer `bleu_score.py` into `utils`. Note that the BLEU score requires python3. +- Fuse QKV Gemm of encoder and masked_multi_head_attention of decoder. +- Add dynamic batch size and dynamic sequence length features into all ops. + +March 2020 +- Add feature in FasterTransformer 2.0 + - Add `translate_sample.py` to demonstrate how to translate a sentence by restoring the pretrained model of OpenNMT-tf. +- Fix bugs of Fastertransformer 2.0 + - Fix the bug of maximum sequence length of decoder cannot be larger than 128. + - Fix the bug that decoding does not check finish or not after each step. + - Fix the bug of decoder about max_seq_len. + - Modify the decoding model structure to fit the OpenNMT-tf decoding model. 
+ - Add a layer normalization layer after decoder. + - Add a normalization for inputs of decoder + +February 2020 +- **Release the FasterTransformer 2.0** + - Provide a highly optimized OpenNMT-tf based decoder and decoding, including C++ API and TensorFlow op. + - Refine the sample codes of encoder. + - Add dynamic batch size feature into encoder op. + +July 2019 +- **Release the FasterTransformer 1.0** + - Provide a highly optimized bert equivalent transformer layer, including C++ API, TensorFlow op and TensorRT plugin. + +### Known issues + +- Cannot compile on tensorflow 2.10 due to undefined symbol issue. +- Undefined symbol errors when import the extension + - Please `import torch` first. If this has been done, it is due to the incompatible C++ ABI. You may need to check the PyTorch used during compilation and execution are the same, or you need to check how your PyTorch is compiled, or the version of your GCC, etc. +- Results of TensorFlow and OP would be different in decoding. This problem is caused by the accumulated log probability, and we do not avoid this problem. +- If encounter some problem in the custom environment, try to use the gcc/g++ 4.8 to build the project of TensorFlow op, especially for TensorFlow 1.14. diff --git a/README.md b/README.md index a00e0d631..50f50cab2 100644 --- a/README.md +++ b/README.md @@ -1,417 +1,15 @@ -# FasterTransformer +# FasterTransformer for SaumsungCEChallenge -This repository provides a script and recipe to run the highly optimized transformer-based encoder and decoder component, and it is tested and maintained by NVIDIA. +Check out FasterTransformer [README.md](FasterTransformerReadME.md) -## Table Of Contents +## Installation -- [FasterTransformer](#fastertransformer) - - [Table Of Contents](#table-of-contents) - - [Model overview](#model-overview) - - [Support matrix](#support-matrix) - - [Advanced](#advanced) - - [Global Environment](#global-environment) - - [Performance](#performance) - - [BERT base performance](#bert-base-performance) - - [BERT base performances of FasterTransformer new features](#bert-base-performances-of-fastertransformer-new-features) - - [BERT base performance on TensorFlow](#bert-base-performance-on-tensorflow) - - [BERT base performance on PyTorch](#bert-base-performance-on-pytorch) - - [Decoding and Decoder performance](#decoding-and-decoder-performance) - - [Decoder and Decoding end-to-end translation performance on TensorFlow](#decoder-and-decoding-end-to-end-translation-performance-on-tensorflow) - - [Decoder and Decoding end-to-end translation performance on PyTorch](#decoder-and-decoding-end-to-end-translation-performance-on-pytorch) - - [GPT performance](#gpt-performance) - - [Release notes](#release-notes) - - [Changelog](#changelog) - - [Known issues](#known-issues) - -## Model overview - -In NLP, encoder and decoder are two important components, with the transformer layer becoming a popular architecture for both components. FasterTransformer implements a highly optimized transformer layer for both the encoder and decoder for inference. On Volta, Turing and Ampere GPUs, the computing power of Tensor Cores are used automatically when the precision of the data and weights are FP16. - -FasterTransformer is built on top of CUDA, cuBLAS, cuBLASLt and C++. We provide at least one API of the following frameworks: TensorFlow, PyTorch and Triton backend. Users can integrate FasterTransformer into these frameworks directly. 
For supporting frameworks, we also provide example codes to demonstrate how to use, and show the performance on these frameworks. - -### Support matrix - -| Models | Framework | FP16 | INT8 (after Turing) | Sparsity (after Ampere) | Tensor parallel | Pipeline parallel | FP8 (after Hopper) | -| ---------------- | -------------- | ---- | ------------------- | ----------------------- | --------------- | ----------------- | ------------------ | -| BERT | TensorFlow | Yes | Yes | - | - | - | - | -| BERT | PyTorch | Yes | Yes | Yes | Yes | Yes | - | -| BERT | Triton backend | Yes | - | - | Yes | Yes | - | -| BERT | C++ | Yes | Yes | - | - | - | Yes | -| XLNet | C++ | Yes | - | - | - | - | - | -| Encoder | TensorFlow | Yes | Yes | - | - | - | - | -| Encoder | PyTorch | Yes | Yes | Yes | - | - | - | -| Decoder | TensorFlow | Yes | - | - | - | - | - | -| Decoder | PyTorch | Yes | - | - | - | - | - | -| Decoding | TensorFlow | Yes | - | - | - | - | - | -| Decoding | PyTorch | Yes | - | - | - | - | - | -| GPT | TensorFlow | Yes | - | - | - | - | - | -| GPT/OPT | PyTorch | Yes | - | - | Yes | Yes | Yes | -| GPT/OPT | Triton backend | Yes | - | - | Yes | Yes | - | -| GPT-MoE | PyTorch | Yes | - | - | Yes | Yes | - | -| BLOOM | PyTorch | Yes | - | - | Yes | Yes | - | -| BLOOM | Triton backend | Yes | - | - | Yes | Yes | - | -| GPT-J | Triton backend | Yes | - | - | Yes | Yes | - | -| Longformer | PyTorch | Yes | - | - | - | - | - | -| T5/UL2 | PyTorch | Yes | - | - | Yes | Yes | - | -| T5 | TensorFlow 2 | Yes | - | - | - | - | - | -| T5/UL2 | Triton backend | Yes | - | - | Yes | Yes | - | -| T5 | TensorRT | Yes | - | - | Yes | Yes | - | -| T5-MoE | PyTorch | Yes | - | - | Yes | Yes | - | -| Swin Transformer | PyTorch | Yes | Yes | - | - | - | - | -| Swin Transformer | TensorRT | Yes | Yes | - | - | - | - | -| ViT | PyTorch | Yes | Yes | - | - | - | - | -| ViT | TensorRT | Yes | Yes | - | - | - | - | -| GPT-NeoX | PyTorch | Yes | - | - | Yes | Yes | - | -| GPT-NeoX | Triton backend | Yes | - | - | Yes | Yes | - | -| BART/mBART | PyTorch | Yes | - | - | Yes | Yes | - | -| WeNet | C++ | Yes | - | - | - | - | - | -| DeBERTa | TensorFlow 2 | Yes | - | - | On-going | On-going | - | -| DeBERTa | PyTorch | Yes | - | - | On-going | On-going | - | - -* Note that the FasterTransformer supports the models above on C++ because all source codes are built on C++. - -More details of specific models are put in `xxx_guide.md` of [`docs/`](docs), where `xxx` means the model name. Some common questions and the respective answers are put in [`docs/QAList.md`](docs/QAList.md). Note that the model of Encoder and BERT are similar and we put the explanation into `bert_guide.md` together. - -## Advanced - -The following code lists the directory structure of FasterTransformer: ``` -/src/fastertransformer: source code of FasterTransformer - |--/cutlass_extensions: Implementation of cutlass gemm/kernels. - |--/kernels: CUDA kernels for different models/layers and operations, like addBiasResiual. - |--/layers: Implementation of layer modules, like attention layer, ffn layer. - |--/models: Implementation of different models, like BERT, GPT. - |--/tensorrt_plugin: encapluate FasterTransformer into TensorRT plugin. 
- |--/tf_op: custom Tensorflow OP implementation - |--/th_op: custom PyTorch OP implementation - |--/triton_backend: custom triton backend implementation - |--/utils: Contains common cuda utils, like cublasMMWrapper, memory_utils -/examples: C++, tensorflow and pytorch interface examples - |--/cpp: C++ interface examples - |--/pytorch: PyTorch OP examples - |--/tensorflow: TensorFlow OP examples - |--/tensorrt: TensorRT examples -/docs: Documents to explain the details of implementation of different models, and show the benchmark -/benchmark: Contains the scripts to run the benchmarks of different models -/tests: Unit tests -/templates: Documents to explain how to add a new model/example into FasterTransformer repo -``` - -Note that many folders contains many sub-folders to split different models. Quantization tools are move to `examples`, like `examples/tensorflow/bert/bert-quantization/` and `examples/pytorch/bert/bert-quantization-sparsity/`. - - -### Global Environment - -FasterTransformer provides some convenient environment variables for debuging and testing. - -1. `FT_LOG_LEVEL`: This environment controls the log level of debug messae. More details are in `src/fastertransformer/utils/logger.h`. Note that the program will print lots of message when the level is lower than `DEBUG` and the program would become very slow. -2. `FT_NVTX`: If it is set to be `ON` like `FT_NVTX=ON ./bin/gpt_example`, the program will insert tha tag of nvtx to help profiling the program. -3. `FT_DEBUG_LEVEL`: If it is set to be `DEBUG`, then the program will run `cudaDeviceSynchronize()` after every kernels. Otherwise, the kernel is executued asynchronously by default. It is helpful to locate the error point during debuging. But this flag affects the performance of program significantly. So, it should be used only for debuging. - -## Performance - -Hardware settings: - -* 8xA100-80GBs (with mclk 1593MHz, pclk 1410MHz) with AMD EPYC 7742 64-Core Processor -* T4 (with mclk 5000MHz, pclk 1590MHz) with Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz - -In order to run the following benchmark, we need to install the unix computing tool "bc" by - -```bash -apt-get install bc +mkdir -p FasterTransformer/build +cd FasterTransformer/build +git submodule init && git submodule update +cmake -DSM=xx -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON . +make -j32 ``` -### BERT base performance - -The FP16 results of TensorFlow were obtained by running the `benchmarks/bert/tf_benchmark.sh`. - -The INT8 results of TensorFlow were obtained by running the `benchmarks/bert/tf_int8_benchmark.sh`. - -The FP16 results of PyTorch were obtained by running the `benchmarks/bert/pyt_benchmark.sh`. - -The INT8 results of PyTorch were obtained by running the `benchmarks/bert/pyt_int8_benchmark.sh`. - -More benchmarks are put in [`docs/bert_guide.md`](docs/bert_guide.md#bert-performance). - -#### BERT base performances of FasterTransformer new features - -The following figure compares the performances of different features of FasterTransformer and FasterTransformer under FP16 on T4. - -For large batch size and sequence length, both EFF-FT and FT-INT8-v2 bring about 2x speedup. Using Effective FasterTransformer and int8v2 at the same time can bring about 3.5x speedup compared to FasterTransformer FP16 for large case. - -
- -#### BERT base performance on TensorFlow - -The following figure compares the performances of different features of FasterTransformer and TensorFlow XLA under FP16 on T4. - -For small batch size and sequence length, using FasterTransformer can bring about 3x speedup. - -For large batch size and sequence length, using Effective FasterTransformer with INT8-v2 quantization can bring about 5x speedup. - -
- -#### BERT base performance on PyTorch - -The following figure compares the performances of different features of FasterTransformer and PyTorch TorchScript under FP16 on T4. - -For small batch size and sequence length, using FasterTransformer CustomExt can bring about 4x ~ 6x speedup. - -For large batch size and sequence length, using Effective FasterTransformer with INT8-v2 quantization can bring about 5x speedup. - -
- -### Decoding and Decoder performance - -The results of TensorFlow were obtained by running the `benchmarks/decoding/tf_decoding_beamsearch_benchmark.sh` and `benchmarks/decoding/tf_decoding_sampling_benchmark.sh` - -The results of PyTorch were obtained by running the `benchmarks/decoding/pyt_decoding_beamsearch_benchmark.sh`. - -In the experiments of decoding, we updated the following parameters: - -* head_num = 8 -* size_per_head = 64 -* num_layers = 6 for both encoder and decoder -* vocabulary_size = 32001 for TensorFlow sample codes, 31538 for PyTorch sample codes -* memory_hidden_dim = 512 -* max sequenc elength = 128 - -More benchmarks are put in [`docs/decoder_guide.md`](docs/decoder_guide.md#decoding-performance). - -#### Decoder and Decoding end-to-end translation performance on TensorFlow - -The following figure shows the speedup of of FT-Decoder op and FT-Decoding op compared to TensorFlow under FP16 with T4. Here, we use the throughput of translating a test set to prevent the total tokens of each methods may be different. Compared to TensorFlow, FT-Decoder provides 1.5x ~ 3x speedup; while FT-Decoding provides 4x ~ 18x speedup. - -
- -#### Decoder and Decoding end-to-end translation performance on PyTorch - -The following figure shows the speedup of of FT-Decoder op and FT-Decoding op compared to PyTorch under FP16 with T4. Here, we use the throughput of translating a test set to prevent the total tokens of each methods may be different. Compared to PyTorch, FT-Decoder provides 1.2x ~ 3x speedup; while FT-Decoding provides 3.8x ~ 13x speedup. - -
- -### GPT performance - -The following figure compares the performances of Megatron and FasterTransformer under FP16 on A100. - -In the experiments of decoding, we updated the following parameters: - -* head_num = 96 -* size_per_head = 128 -* num_layers = 48 for GPT-89B model, 96 for GPT-175B model -* data_type = FP16 -* vocab_size = 51200 -* top_p = 0.9 -* tensor parallel size = 8 -* input sequence length = 512 -* output sequence length = 32 - -
- -## Release notes - -### Changelog - -May 2023 -- Fix bugs of generation early stopping - -January 2023 -- Support GPT MoE -- Support FP8 for Bert and GPT (**Experimental**) -- Support DeBERTa on TensorFlow 2 and PyTorch - -Dec 2022 -- **Release the FasterTransformer 5.2** -- Support min length penalty - -Nov 2022 -- Support T5 Tensorflow 2 custom op. -- Support T5 MoE -- Support WeNet -- Support BART & mBART -- Support SwinV2 -- Initial support for w8a8 int8 mode with GPT (preview) -- Support fused mha in GPT - -Oct 2022 -- Support BLOOM - -Sep 2022 -- Support factual sampling ([link](https://arxiv.org/pdf/2206.04624.pdf)) in gpt -- Support for IA3 adapting scheme in T5 - -Aug 2022 -- Support returning context tokens embeddings in GPT -- **Release the FasterTransformer 5.1** -- Support for interactive generation -- Support for attention time-limited memory -- Support mt5 and t5-v1.1 - -July 2022 -- Support UL2 huggingface ckpt. ([link](https://huggingface.co/google/ul2)) - - Fix bug of T5 under bfloat16. -- Add ViT INT8 TensorRT Plugin -- Support batch sampling -- Support shared context optimization in GPT model - -June 2022 -- Support streaming generation for triton backend. -- Support OPT. -- Support multi-node multi-GPU BERT under FP32, FP16 and BF16. - -May 2022 -- Support bfloat16 on most models. -- Support [prefix-prompt](https://arxiv.org/pdf/2101.00190.pdf) for GPT-J. -- Support GPT-NeoX. - - epsilon value used in layernorm is now a parameter - - rotary embedding GPT-NeoX style (only GPT-J was implemented) - - load per-GPU layernorm and bias parameters - - weight conversion from EleutherAI checkpoint - -April 2022 -- **Release the FasterTransformer 5.0** - - Change the default accumulation type of all gemm to FP32. - - Support bfloat16 inference in GPT model. - - Support Nemo Megatron T5 and Megatron-LM T5 model. - - Support ViT. - -March 2022 -- Support `stop_ids` and `ban_bad_ids` in GPT-J. -- Support dynamice `start_id` and `end_id` in GPT-J, GPT, T5 and Decoding. - -February 2022 -- Support Swin Transformer. -- Optimize the k/v cache update of beam search by in-direction buffer. -- Support runtime input for GPT-J, T5 and GPT. -- Support soft prompt in GPT and GPT-J. -- Support custom all reduce kernel. - - Limitation: - 1. Only support tensor parallel size = 8 on DGX-A100. - 2. Only support CUDA with cudaMallocAsync. - -December 2021 -- Add TensorRT plugin of T5 model. -- Change some hyper-parameters of GPT model to runtime query. -- Optimize the memory allocator under C++ code. -- Fix bug of CUB including when using CUDA 11.5 or newer version. - -November 2021 -- **Update the FasterTransformer 5.0 beta** -- Add GPT-3 INT8 weight only qauntization for batch size <= 2. -- Support multi-node multi-gpu support on T5. -- Enhance the multi-node multi-gpu supporting in GPT-3. - -August 2021 -- **Release the FasterTransformer 5.0 beta** - - Refactor the repo and codes - - And special thanks to NAVER Corp. for contributing a lot to this version, as listed below. - - Bugs fix - - Fix error that occurs when batch_size is less than max_batch_size for gpt pytorch wrapper. - - Fix memory leak that occurs every forward because of reused allocator. - - Fix race condition that occurs in repetition penalty kernel. - - Enhancement - - Add random seed setting. - - Fix GEMM buffer overflow on FP16 of GPT. - - Change to invalidate finished buffer for every completion. - - Introduce stop_before for early stop. - - Support Longformer. - - Rename `layer_para` to `pipeline_para`. 
- - Optimize the sorting of top p sampling. - - Support sparsity for Ampere GPUs on BERT. - - Support `size_per_head` 96, 160, 192, 224, 256 for GPT model. - - Support multi-node inference for GPT Triton backend. - -June 2021 -- Support XLNet - -April 2021 -- **Release the FasterTransformer 4.0** - - Support multi-gpus and multi-nodes inference for GPT model on C++ and PyTorch. - - Support single node, multi-gpus inference for GPT model on triton. - - Add the int8 fused multi-head attention kernel for bert. - - Add the FP16 fused multi-head attention kernel of V100 for bert. - - Optimize the kernel of decoder. - - Move to independent repo. - - Eager mode PyTorch extension is deprecated. - -Dec 2020 -- **Release the FasterTransformer 3.1** - - Optimize the decoding by adding the finisehd mask to prevent useless computing. - - Support opennmt encoder. - - Remove the TensorRT plugin supporting. - - TorchScript custom op is deprecated. - -Nov 2020 -- Optimize the INT8 inference. -- Support PyTorch INT8 inference. -- Provide PyTorch INT8 quantiztion tools. -- Integrate the fused multi-head attention kernel of TensorRT into FasterTransformer. -- Add unit test of SQuAD. -- Update the missed NGC checkpoints. - -Sep 2020 -- Support GPT2 -- **Release the FasterTransformer 3.0** - - Support INT8 quantization of encoder of cpp and TensorFlow op. - - Add bert-tf-quantization tool. - - Fix the issue that Cmake 15 or Cmake 16 fail to build this project. - -Aug 2020 -- Fix the bug of trt plugin. - -June 2020 -- **Release the FasterTransformer 2.1** - - Add Effective FasterTransformer based on the idea of [Effective Transformer](https://github.com/bytedance/effective_transformer) idea. - - Optimize the beam search kernels. - - Add PyTorch op supporting - -May 2020 -- Fix the bug that seq_len of encoder must be larger than 3. -- Add the position_encoding of decoding as the input of FasterTransformer decoding. This is convenient to use different types of position encoding. FasterTransformer does not compute the position encoding value, but only lookup the table. -- Modifying the method of loading model in `translate_sample.py`. - -April 2020 -- Rename `decoding_opennmt.h` to `decoding_beamsearch.h` -- Add DiverseSiblingsSearch for decoding. -- Add sampling into Decoding - - The implementation is in the `decoding_sampling.h` - - Add top_k sampling, top_p sampling for decoding. -- Refactor the tensorflow custom op codes. - - Merge `bert_transformer_op.h`, `bert_transformer_op.cu.cc` into `bert_transformer_op.cc` - - Merge `decoder.h`, `decoder.cu.cc` into `decoder.cc` - - Merge `decoding_beamsearch.h`, `decoding_beamsearch.cu.cc` into `decoding_beamsearch.cc` -- Fix the bugs of finalize function decoding.py. -- Fix the bug of tf DiverseSiblingSearch. -- Add BLEU scorer `bleu_score.py` into `utils`. Note that the BLEU score requires python3. -- Fuse QKV Gemm of encoder and masked_multi_head_attention of decoder. -- Add dynamic batch size and dynamic sequence length features into all ops. - -March 2020 -- Add feature in FasterTransformer 2.0 - - Add `translate_sample.py` to demonstrate how to translate a sentence by restoring the pretrained model of OpenNMT-tf. -- Fix bugs of Fastertransformer 2.0 - - Fix the bug of maximum sequence length of decoder cannot be larger than 128. - - Fix the bug that decoding does not check finish or not after each step. - - Fix the bug of decoder about max_seq_len. - - Modify the decoding model structure to fit the OpenNMT-tf decoding model. 
- - Add a layer normalization layer after decoder. - - Add a normalization for inputs of decoder - -February 2020 -- **Release the FasterTransformer 2.0** - - Provide a highly optimized OpenNMT-tf based decoder and decoding, including C++ API and TensorFlow op. - - Refine the sample codes of encoder. - - Add dynamic batch size feature into encoder op. - -July 2019 -- **Release the FasterTransformer 1.0** - - Provide a highly optimized bert equivalent transformer layer, including C++ API, TensorFlow op and TensorRT plugin. - -### Known issues - -- Cannot compile on tensorflow 2.10 due to undefined symbol issue. -- Undefined symbol errors when import the extension - - Please `import torch` first. If this has been done, it is due to the incompatible C++ ABI. You may need to check the PyTorch used during compilation and execution are the same, or you need to check how your PyTorch is compiled, or the version of your GCC, etc. -- Results of TensorFlow and OP would be different in decoding. This problem is caused by the accumulated log probability, and we do not avoid this problem. -- If encounter some problem in the custom environment, try to use the gcc/g++ 4.8 to build the project of TensorFlow op, especially for TensorFlow 1.14. From 6bbba8602919f6234bb6133a0381d43a5ab746a0 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 11 Sep 2023 08:59:12 +0000 Subject: [PATCH 02/55] add lamma template --- src/fastertransformer/models/CMakeLists.txt | 1 + .../models/llama/CMakeLists.txt | 69 + src/fastertransformer/models/llama/LLaMA.cc | 1211 +++++++++++++++++ src/fastertransformer/models/llama/LLaMA.h | 218 +++ .../models/llama/LLaMAContextDecoder.cc | 514 +++++++ .../models/llama/LLaMAContextDecoder.h | 117 ++ .../models/llama/LLaMADecoder.cc | 391 ++++++ .../models/llama/LLaMADecoder.h | 104 ++ .../models/llama/LLaMADecoderLayerWeight.cc | 220 +++ .../models/llama/LLaMADecoderLayerWeight.h | 62 + .../models/llama/LLaMAWeight.cc | 302 ++++ .../models/llama/LLaMAWeight.h | 106 ++ .../th_op/llama/CMakeLists.txt | 17 + src/fastertransformer/th_op/llama/LLaMA.cc | 164 +++ src/fastertransformer/th_op/llama/LLaMA.h | 346 +++++ 15 files changed, 3842 insertions(+) create mode 100644 src/fastertransformer/models/llama/CMakeLists.txt create mode 100644 src/fastertransformer/models/llama/LLaMA.cc create mode 100644 src/fastertransformer/models/llama/LLaMA.h create mode 100644 src/fastertransformer/models/llama/LLaMAContextDecoder.cc create mode 100644 src/fastertransformer/models/llama/LLaMAContextDecoder.h create mode 100644 src/fastertransformer/models/llama/LLaMADecoder.cc create mode 100644 src/fastertransformer/models/llama/LLaMADecoder.h create mode 100644 src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc create mode 100644 src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h create mode 100644 src/fastertransformer/models/llama/LLaMAWeight.cc create mode 100644 src/fastertransformer/models/llama/LLaMAWeight.h create mode 100755 src/fastertransformer/th_op/llama/CMakeLists.txt create mode 100755 src/fastertransformer/th_op/llama/LLaMA.cc create mode 100755 src/fastertransformer/th_op/llama/LLaMA.h diff --git a/src/fastertransformer/models/CMakeLists.txt b/src/fastertransformer/models/CMakeLists.txt index 248b4af3d..afc4f8b7b 100644 --- a/src/fastertransformer/models/CMakeLists.txt +++ b/src/fastertransformer/models/CMakeLists.txt @@ -27,6 +27,7 @@ add_subdirectory(t5) add_subdirectory(bart) add_subdirectory(gptj) add_subdirectory(gptneox) +add_subdirectory(llama) 
add_subdirectory(multi_gpu_gpt) if(ENABLE_FP8) add_subdirectory(gpt_fp8) diff --git a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt new file mode 100644 index 000000000..da314ec7d --- /dev/null +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -0,0 +1,69 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.8) + +add_library(LLaMADecoderLayerWeight STATIC LLaMADecoderLayerWeight.cc) +set_property(TARGET LLaMADecoderLayerWeight PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMADecoderLayerWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMADecoderLayerWeight PUBLIC memory_utils cuda_utils logger) + +add_library(LLaMADecoder STATIC LLaMADecoder.cc) +set_property(TARGET LLaMADecoder PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMADecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMADecoder PUBLIC -lcudart cublasMMWrapper + TensorParallelDecoderSelfAttentionLayer + TensorParallelGeluFfnLayer + layernorm_kernels + add_residual_kernels + LLaMADecoderLayerWeight + tensor + nccl_utils + cuda_utils + logger) + +add_library(LLaMAContextDecoder STATIC LLaMAContextDecoder.cc) +set_property(TARGET LLaMAContextDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMAContextDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMAContextDecoder PUBLIC -lcudart cublasMMWrapper + TensorParallelGptContextAttentionLayer + TensorParallelGeluFfnLayer + layernorm_kernels + add_residual_kernels + gpt_kernels + tensor + nccl_utils + cuda_utils + logger) + +add_library(LLaMAWeight STATIC LLaMAWeight.cc) +set_property(TARGET LLaMAWeight PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMAWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMAWeight PUBLIC LLaMADecoderLayerWeight cuda_utils logger) + +add_library(LLaMA STATIC LLaMA.cc) +set_property(TARGET LLaMA PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMA PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMA PUBLIC -lcudart + LLaMADecoder + LLaMAContextDecoder + decoding_kernels + gpt_kernels + DynamicDecodeLayer + BaseBeamSearchLayer + bert_preprocess_kernels + tensor + LLaMAWeight + cuda_utils + logger) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc new file mode 100644 index 000000000..2ce2dae7b --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -0,0 +1,1211 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/models/gptneox/GptNeoX.h" +#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" +#include "src/fastertransformer/kernels/decoding_kernels.h" +#include "src/fastertransformer/kernels/gpt_kernels.h" +#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" +#include + +namespace fastertransformer { + +template +void GptNeoX::initialize() +{ + gpt_context_decoder_ = new GptNeoXContextDecoder(head_num_, + size_per_head_, + inter_size_, + num_layer_, + rotary_embedding_dim_, + neox_rotary_style_, + use_gptj_residual_, + layernorm_eps_, + tensor_para_, + pipeline_para_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + is_context_qk_buf_float_, + attention_type_, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + gpt_decoder_ = new GptNeoXDecoder(head_num_, + size_per_head_, + inter_size_, + num_layer_, + rotary_embedding_dim_, + neox_rotary_style_, + use_gptj_residual_, + layernorm_eps_, + tensor_para_, + pipeline_para_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + dynamic_decode_layer_ = new DynamicDecodeLayer(vocab_size_, + vocab_size_padded_, + 0, // end_id, deprecated + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + cuda_device_prop_); +} + +template +void GptNeoX::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void GptNeoX::allocateBuffer( + size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + const size_t batchxbeam = batch_size * beam_width; + const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batchxbeam * max_cache_seq_len + * hidden_units_ / tensor_para_.world_size_; + + if (vocab_size_ != vocab_size_padded_) { + padded_embedding_kernel_ = + (T*)(allocator_->reMalloc(padded_embedding_kernel_, sizeof(T) * hidden_units_ * vocab_size_padded_, true)); + padded_embedding_kernel_ptr_ = padded_embedding_kernel_; + + padded_embedding_bias_ = + (T*)(allocator_->reMalloc(padded_embedding_bias_, sizeof(T) * vocab_size_padded_, true)); + } + + input_attention_mask_ = (T*)(allocator_->reMalloc( + input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); + decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + decoder_output_buf_ = + (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + normed_decoder_output_buf_ = + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + nccl_logits_buf_ = + (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); + finished_buf_ = 
(bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); + h_finished_buf_ = new bool[batchxbeam]; + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); + + key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); + value_cache_ = key_cache_ + self_cache_size; + if (beam_width > 1) { + cache_indirections_[0] = + (int*)(allocator_->reMalloc(cache_indirections_[0], sizeof(int) * batchxbeam * max_seq_len * 2, true)); + cache_indirections_[1] = cache_indirections_[0] + batchxbeam * max_seq_len; + } + + // prompt_learning weight batch ptrs + prompt_learning_weight_batch_ = + (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); + tiled_prompt_lengths_buf_ = + (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); + + tiled_input_ids_buf_ = + (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); + tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batchxbeam, true)); + tiled_total_padding_count_ = + (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), false); + + transposed_output_ids_buf_ = + (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); + output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); + parent_ids_buf_ = (int*)(allocator_->reMalloc(parent_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); + seq_limit_len_ = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false)); + masked_tokens_ = (bool*)(allocator_->reMalloc(masked_tokens_, sizeof(bool) * batchxbeam * max_cache_seq_len, true)); + + start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, false)); + end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false)); + + context_decoder_input_buf_ = (T*)(allocator_->reMalloc( + context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); + context_decoder_output_buf_ = (T*)(allocator_->reMalloc( + context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); + output_log_probs_buf_ = + (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false)); + + generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + + is_allocate_buffer_ = true; +} + +template +void GptNeoX::freeBuffer() +{ + if (is_allocate_buffer_) { + if (vocab_size_ != vocab_size_padded_) { + padded_embedding_kernel_ptr_ = nullptr; + allocator_->free((void**)(&padded_embedding_kernel_)); + allocator_->free((void**)(&padded_embedding_bias_)); + } + + allocator_->free((void**)(&input_attention_mask_)); + allocator_->free((void**)(&decoder_input_buf_)); + allocator_->free((void**)(&decoder_output_buf_)); + allocator_->free((void**)(&normed_decoder_output_buf_)); + allocator_->free((void**)(&logits_buf_)); + allocator_->free((void**)(&nccl_logits_buf_)); + allocator_->free((void**)(&cum_log_probs_)); + allocator_->free((void**)(&finished_buf_)); + delete[] h_finished_buf_; + allocator_->free((void**)(&sequence_lengths_)); + + allocator_->free((void**)(&key_cache_)); + if (cache_indirections_[0] != nullptr) { + allocator_->free((void**)(&cache_indirections_)[0]); + } + + 
allocator_->free((void**)(&prompt_learning_weight_batch_)); + allocator_->free((void**)(&tiled_prompt_lengths_buf_)); + + allocator_->free((void**)(&tiled_input_ids_buf_)); + allocator_->free((void**)(&tiled_input_lengths_buf_)); + allocator_->free((void**)(&tiled_total_padding_count_)); + + allocator_->free((void**)(&transposed_output_ids_buf_)); + allocator_->free((void**)(&output_ids_buf_)); + allocator_->free((void**)(&parent_ids_buf_)); + allocator_->free((void**)(&seq_limit_len_)); + allocator_->free((void**)(&masked_tokens_)); + + allocator_->free((void**)(&start_ids_buf_)); + allocator_->free((void**)(&end_ids_buf_)); + + allocator_->free((void**)(&context_decoder_input_buf_)); + allocator_->free((void**)(&context_decoder_output_buf_)); + allocator_->free((void**)(&output_log_probs_buf_)); + + allocator_->free((void**)(&generation_should_stop_), true); + + is_allocate_buffer_ = false; + } +} + +template +GptNeoX::GptNeoX(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + start_id_(start_id), + end_id_(end_id), + prompt_learning_start_id_(prompt_learning_start_id), + prompt_learning_type_(prompt_learning_type), + use_gptj_residual_(use_gptj_residual), + hidden_units_(head_num * size_per_head), + local_head_num_(head_num / 1), + attention_type_(attention_type) +{ + tensor_para_.world_size_ = 1; + tensor_para_.rank_ = 0; + pipeline_para_.world_size_ = 1; + pipeline_para_.rank_ = 0; + + int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); + if (std::is_same::value) { + local_vacab_size = ceil(local_vacab_size / 8.f) * 8; + } + vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; + initialize(); +} + +template +GptNeoX::GptNeoX(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), + 
head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + start_id_(start_id), + end_id_(end_id), + prompt_learning_start_id_(prompt_learning_start_id), + prompt_learning_type_(prompt_learning_type), + use_gptj_residual_(use_gptj_residual), + hidden_units_(head_num * size_per_head), + tensor_para_(tensor_para), + pipeline_para_(pipeline_para), + local_head_num_(head_num / tensor_para.world_size_), + custom_all_reduce_comm_(custom_all_reduce_comm), + enable_custom_all_reduce_(enable_custom_all_reduce), + attention_type_(attention_type) +{ + int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); + if (std::is_same::value) { + local_vacab_size = ceil(local_vacab_size / 8.f) * 8; + } + vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; + initialize(); +} + +template +GptNeoX::GptNeoX(GptNeoX const& gpt): + BaseLayer(gpt), + head_num_(gpt.head_num_), + size_per_head_(gpt.size_per_head_), + inter_size_(gpt.inter_size_), + num_layer_(gpt.num_layer_), + vocab_size_(gpt.vocab_size_), + rotary_embedding_dim_(gpt.rotary_embedding_dim_), + start_id_(gpt.start_id_), + end_id_(gpt.end_id_), + prompt_learning_start_id_(gpt.prompt_learning_start_id_), + prompt_learning_type_(gpt.prompt_learning_type_), + use_gptj_residual_(gpt.use_gptj_residual_), + hidden_units_(gpt.hidden_units_), + tensor_para_(gpt.tensor_para_), + pipeline_para_(gpt.pipeline_para_), + local_head_num_(gpt.local_head_num_), + vocab_size_padded_(gpt.vocab_size_padded_), + custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), + enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), + attention_type_(gpt.attention_type_) +{ + initialize(); +} + +template +GptNeoX::~GptNeoX() +{ + delete gpt_decoder_; + delete dynamic_decode_layer_; + delete gpt_context_decoder_; + freeBuffer(); +} + +template +void GptNeoX::registerCallback(callback_sig* fn, void* ctx) +{ + token_generated_cb_ = fn; + token_generated_ctx_ = ctx; +} + +template +void GptNeoX::unRegisterCallback() +{ + token_generated_cb_ = nullptr; + token_generated_ctx_ = nullptr; +} + +template +void GptNeoX::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const GptNeoXWeight* gpt_weights) +{ + FT_CHECK(false); +} + +template +void GptNeoX::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const GptNeoXWeight* gpt_weights) +{ + // input_tensors: + // input_ids [batch_size, max_input_length] + // input_lengths [batch_size] + // prompt_learning_task_name_ids [batch_size] on cpu, optional + // output_seq_len [batch_size] on cpu + // start_id [batch_size] on cpu, optional + // end_id [batch_size] on cpu, optional + // stop_words_list [batch_size, 2, stop_words_length], optional + // bad_words_list [2, bad_words_length] or [batch_size, 2, bad_words_length], optional + // runtime_top_k [1] or [batch_size] on cpu, optional, uint. + // runtime_top_p [1] or [batch_size] on cpu, optional, float. + // beam_search_diversity_rate [1] or [batch_size] on cpu, optional, float. + // temperature [1] or [batch_size] on cpu, optional, float. + // len_penalty [1] or [batch_size] on cpu, optional, float. + // repetition_penalty [1] or [batch_size] on cpu, optional, float. + // min_length [1] or [batch_size] on cpu, optional, int + // random_seed [1] or [batch_size] on cpu, optional, unsigned long long int. 
+ // request_prompt_lengths [batch_size], optional + // request_prompt_embedding [batch_size, max_prompt_length, hidden_units], float, optional + // requst_prompt_type [batch_size], int, optional + // top_p_decay [batch_size] on gpu, float, optional + // top_p_min [batch_size] on gpu, float, optional + // top_p_reset_ids [batch_size] on gpu, uint32, optional + + // output_tensors: + // output_ids [batch_size, beam_width, max_output_seq_len] + // sequence_length [batch_size, beam_width] + // output_log_probs [batch_size, beam_width, request_output_seq_len], must be float*. + // optional. It leads to additional computing cost. If we don't need this result, don't put it. + // cum_log_probs [batch_size, beam], optional, must be float*. + // optional. It leads to additional computing cost. If we don't need this result, don't put it. + + // Step is from max_input_length ~ max_output_seq_len, + // When step = k, we put output ids and caches at step k, and the sequence_length would be k - 1 before + // complete this step. + // When there is no input_ids, put the start token at step 0 of output_ids_buf_. After forward, only copy + // the step 1 ~ max_output_seq_len of output_ids_buf_ to output_tensors->at(0).data + + FT_CHECK_WITH_INFO(input_tensors->size() >= 3, "input_tensors->size() >= 3"); + FT_CHECK_WITH_INFO(output_tensors->size() >= 2, "output_tensors->size() >= 2"); + FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); + FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); + FT_CHECK(input_tensors->find("output_seq_len") != input_tensors->end() + && input_tensors->at("output_seq_len").shape.size() == 1); + FT_CHECK(output_tensors->at("output_ids").shape.size() == 3); + FT_CHECK(output_tensors->at("sequence_length").shape.size() == 2); + FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape[0] == output_tensors->at("output_ids").shape[0], + "input_tensors->at(\"input_ids\").shape[0] == output_tensors->at(\"output_ids\").shape[0]"); + + const size_t batch_size = output_tensors->at("output_ids").shape[0]; + const size_t beam_width = output_tensors->at("output_ids").shape[1]; + + PromptLearningType request_prompt_type = PromptLearningType::no_prompt; + int valid_prompt_inputs = input_tensors->count("request_prompt_type") + + input_tensors->count("request_prompt_lengths") + + input_tensors->count("request_prompt_embedding"); + + if (valid_prompt_inputs == 3) { + request_prompt_type = static_cast(input_tensors->at("request_prompt_type").getVal()); + FT_LOG_INFO("Apply prompt embedding from input, will ignore task name ids"); + } + else if (valid_prompt_inputs > 0) { + FT_LOG_WARNING( + "Prompts not applied: request_prompt_embedding, request_prompt_lengths, request_prompt_type are all needed!"); + } + if (request_prompt_type == PromptLearningType::prefix_prompt) { + FT_LOG_WARNING("Request prompt doesn't support prefix prompt currently!"); + } + + // Prefix Prompt Inputs + // Padding works as follows: p p x x i i i x x --> p p i i i x x x x (p denotes prompt, i denotes input, x denotes + // pad) + // TODO (perkzz): move unnecessary paddings + const int* prompt_learning_task_name_ids = + input_tensors->count("prompt_learning_task_name_ids") ? 
+ input_tensors->at("prompt_learning_task_name_ids").getPtr() : + nullptr; + has_prefix_prompt_ = + (prompt_learning_task_name_ids != nullptr) && (prompt_learning_type_ == PromptLearningType::prefix_prompt); + int max_prefix_prompt_length = 0; + + FT_CHECK_WITH_INFO( + !(prompt_learning_task_name_ids != nullptr + && (prompt_learning_type_ == PromptLearningType::no_prompt + || prompt_learning_type_ == PromptLearningType::soft_prompt)), + "prompt_learning_type is prefix_prompt either p_prompt_tuning when prompt_learning_task_name_ids are provided."); + + // NOTE: Prefix Prompt PreProcessing + // get prefix_prompt_weight for each batch --> shape [batch, beam_width] + // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] + std::vector prefix_prompt_weight_batch_ptrs; + std::vector prefix_prompt_lengths; + if (has_prefix_prompt_) { + for (int bs_id = 0; bs_id < batch_size; ++bs_id) { + int task_id = prompt_learning_task_name_ids[bs_id]; + // throw errors when prompt task_name_ids are not found + std::pair prefix_prompt_weight_length_pair; + try { + prefix_prompt_weight_length_pair = gpt_weights->prompt_learning_table.at(task_id); + } + catch (const std::out_of_range& oor) { + FT_LOG_ERROR("prefix_prompt_weights_lengths not found for prompt task id: " + task_id); + throw oor; + } + for (int bw_id = 0; bw_id < beam_width; ++bw_id) { + prefix_prompt_weight_batch_ptrs.push_back(prefix_prompt_weight_length_pair.first); + prefix_prompt_lengths.push_back(prefix_prompt_weight_length_pair.second); + } + } + + max_prefix_prompt_length = *max_element(prefix_prompt_lengths.begin(), prefix_prompt_lengths.end()); + + FT_LOG_DEBUG("max_prefix_prompt_length: %d", max_prefix_prompt_length); + + if (max_prefix_prompt_length == 0) { + has_prefix_prompt_ = false; + FT_LOG_DEBUG("prompts are not applied !"); + } + } + + int max_input_length = input_tensors->at("input_ids").shape[1]; + FT_CHECK_WITH_INFO(!(max_input_length == 0 && max_prefix_prompt_length > 0), + "Prefix Prompt should come with inputs!"); + + // Prefix Soft Prompt + has_prefix_soft_prompt_ = request_prompt_type == PromptLearningType::soft_prompt; + const size_t max_prefix_soft_prompt_length = + has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; + const size_t limit_len_offset = max_prefix_soft_prompt_length + (max_input_length == 0 ? 1 : 0); + const size_t max_output_seq_len = input_tensors->at("output_seq_len").max() + limit_len_offset; + const size_t max_seq_len = max_output_seq_len; + // max cache seq len should include max prefix prompt length as it has k/v states + const size_t max_cache_seq_len = max_output_seq_len + max_prefix_prompt_length; + if (max_cache_seq_len < max_seq_len) { + FT_LOG_WARNING("max_cache_seq_len (%d) is less than max_seq_len (%d). " + "Note that this reduces the memory cost of k/v cache, but may hurt the accuracy.", + max_cache_seq_len, + max_seq_len); + } + else if (max_cache_seq_len > max_seq_len) { + FT_LOG_WARNING("max_cache_seq_len (%d) is larger than max_seq_len (%d). " + "This may lead to additional memory cost. 
Suggest to use smaller max_cache_seq_len.", + max_cache_seq_len, + max_seq_len); + } + const cudaDataType_t gemm_data_type = getCudaDataType(); + allocateBuffer( + batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length + max_prefix_soft_prompt_length); + setSeqLimitLen(seq_limit_len_, input_tensors->at("output_seq_len"), limit_len_offset, batch_size); + + sync_check_cuda_error(); + { + TensorMap input_map(*input_tensors); + dynamic_decode_layer_->setup(batch_size, beam_width, &input_map); + handleOptArg(&input_map, "start_id", start_ids_buf_, start_id_, batch_size); + handleOptArg(&input_map, "end_id", end_ids_buf_, end_id_, batch_size); + } + + const DataType data_type = getTensorType(); + + const std::vector self_k_cache_shape = {num_layer_ / pipeline_para_.world_size_, + batch_size * beam_width, + local_head_num_, + size_per_head_ / (16 / sizeof(T)), + max_cache_seq_len, + 16 / sizeof(T)}; + const std::vector self_v_cache_shape = {num_layer_ / pipeline_para_.world_size_, + batch_size * beam_width, + local_head_num_, + max_cache_seq_len, + size_per_head_}; + + // initialize the output ids and parent ids + cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); + cudaMemsetAsync(parent_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); + cudaMemsetAsync(masked_tokens_, false, sizeof(bool) * batch_size * beam_width * max_cache_seq_len, stream_); + cudaMemsetAsync(tiled_total_padding_count_, 0, sizeof(int) * batch_size * beam_width, stream_); + if (beam_width > 1) { + cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); + } + + // Prefix prompts + if (has_prefix_prompt_) { + cudaMemcpyAsync(prompt_learning_weight_batch_, + prefix_prompt_weight_batch_ptrs.data(), + sizeof(T*) * batch_size * beam_width, + cudaMemcpyDefault, + stream_); + cudaMemcpyAsync(tiled_prompt_lengths_buf_, + prefix_prompt_lengths.data(), + sizeof(int) * batch_size * beam_width, + cudaMemcpyDefault, + stream_); + } + + sync_check_cuda_error(); + + // handle first step + if (has_prefix_prompt_ || has_prefix_soft_prompt_ || max_input_length > 1) { + invokeTileGptInputs(tiled_input_ids_buf_, + tiled_input_lengths_buf_, + input_tensors->at("input_ids").getPtr(), + input_tensors->at("input_lengths").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + sync_check_cuda_error(); + + if (has_prefix_soft_prompt_) { + inputIdsEmbeddingLookupPosEncodingSoftPromptParam param; + param.from_tensor = context_decoder_input_buf_; + param.output_ids = output_ids_buf_; + param.input_lengths = tiled_input_lengths_buf_; + param.embedding_table = gpt_weights->pre_decoder_embedding_table; + param.pos_table = gpt_weights->position_encoding_table; + param.prefix_soft_prompt_embedding = input_tensors->at("request_prompt_embedding").getPtr(); + param.prefix_soft_prompt_lengths = input_tensors->at("request_prompt_lengths").getPtr(); + param.input_ids = tiled_input_ids_buf_; + param.start_step = 1; + param.max_input_length = max_input_length; + param.max_prefix_soft_prompt_length = max_prefix_soft_prompt_length; + param.batch_size = batch_size; + param.beam_width = beam_width; + param.hidden_units = hidden_units_; + param.stream = stream_; + + invokeInputIdsEmbeddingLookupPosEncodingSoftPrompt(param); + sync_check_cuda_error(); + max_input_length += max_prefix_soft_prompt_length; // view soft_prompt as input + } + else { + invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, 
+ output_ids_buf_, + gpt_weights->pre_decoder_embedding_table, + gpt_weights->position_encoding_table, + pPromptTuningParam{}, // no p/prompt tuning + tiled_input_ids_buf_, + 1, + max_input_length, + max_input_length, + batch_size * beam_width, + hidden_units_, + stream_); + sync_check_cuda_error(); + } + + invokeBuildDecoderAttentionMask(input_attention_mask_, + tiled_input_lengths_buf_, + tiled_prompt_lengths_buf_, + batch_size * beam_width, + max_input_length, + max_prefix_prompt_length, + stream_); + sync_check_cuda_error(); + + std::unordered_map decoder_input_tensors{ + {"decoder_input", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width, (size_t)max_input_length, hidden_units_}, + context_decoder_input_buf_}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width, + 1, + (size_t)max_input_length, + (size_t)(max_input_length + max_prefix_prompt_length)}, + input_attention_mask_}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_}}, + {"d_prefix_prompt_batch", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width}, + has_prefix_prompt_ ? prompt_learning_weight_batch_ : nullptr}}, + {"d_prefix_prompt_lengths", + Tensor{MEMORY_GPU, + TYPE_INT32, + {batch_size * beam_width}, + has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : nullptr}}}; + + std::unordered_map decoder_output_tensors{ + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width, (size_t)max_input_length, hidden_units_}, + context_decoder_output_buf_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}, + {"last_token_hidden_units", + Tensor{MEMORY_GPU, data_type, {batch_size * beam_width, hidden_units_}, decoder_output_buf_}}}; + + gpt_context_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); + sync_check_cuda_error(); + invokeDecodingInitialize(finished_buf_, + sequence_lengths_, + nullptr, + cum_log_probs_, + start_ids_buf_, + batch_size, + beam_width, + max_input_length - 1, + stream_); + sync_check_cuda_error(); + } + else if (max_input_length == 0) { + FT_CHECK(prompt_learning_type_ == PromptLearningType::no_prompt + && request_prompt_type == PromptLearningType::no_prompt); // Not support prompts in this case + max_input_length++; + invokeDecodingInitialize(finished_buf_, + sequence_lengths_, + output_ids_buf_, + cum_log_probs_, + start_ids_buf_, + batch_size, + beam_width, + max_input_length - 1, + stream_); + std::vector h_input_lengths(batch_size * beam_width, 1); + cudaMemcpyAsync(tiled_input_lengths_buf_, + h_input_lengths.data(), + sizeof(int) * batch_size * beam_width, + cudaMemcpyHostToDevice, + stream_); + sync_check_cuda_error(); + } + else if (max_input_length == 1) { + FT_CHECK(prompt_learning_type_ == PromptLearningType::no_prompt + && request_prompt_type == PromptLearningType::no_prompt); // Not support prompts in this case + invokeDecodingInitialize(finished_buf_, + sequence_lengths_, + nullptr, + cum_log_probs_, + start_ids_buf_, + batch_size, + beam_width, + max_input_length - 1, + stream_); + sync_check_cuda_error(); + invokeTileGptInputs(tiled_input_ids_buf_, + tiled_input_lengths_buf_, + input_tensors->at("input_ids").getPtr(), + input_tensors->at("input_lengths").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + sync_check_cuda_error(); + + cudaMemcpyAsync(output_ids_buf_, + 
tiled_input_ids_buf_, + sizeof(int) * batch_size * beam_width, + cudaMemcpyDeviceToDevice, + stream_); + } + + if (vocab_size_ == vocab_size_padded_) { + padded_embedding_kernel_ptr_ = gpt_weights->post_decoder_embedding.kernel; + } + else { + cudaMemcpyAsync(padded_embedding_kernel_, + gpt_weights->post_decoder_embedding.kernel, + sizeof(T) * vocab_size_ * hidden_units_, + cudaMemcpyDeviceToDevice, + stream_); + cudaMemcpyAsync(padded_embedding_bias_, + gpt_weights->post_decoder_embedding.bias, + sizeof(T) * vocab_size_, + cudaMemcpyDeviceToDevice, + stream_); + sync_check_cuda_error(); + } + + invokeMaskPaddingTokens(masked_tokens_, + input_tensors->at("input_lengths").getPtr(), // not_tiled + tiled_prompt_lengths_buf_, + max_cache_seq_len, + max_input_length + max_prefix_prompt_length, + 0, + batch_size, + beam_width, + stream_); + + for (int step = max_input_length; step < (int)max_output_seq_len; step++) { + const int src_indir_idx = (step - max_input_length) % 2; + const int tgt_indir_idx = 1 - src_indir_idx; + + const size_t local_batch_size = getLocalBatchSize(batch_size, 1, pipeline_para_.world_size_); + FT_CHECK(batch_size % local_batch_size == 0); + const size_t iteration_num = batch_size / local_batch_size; + *generation_should_stop_ = true; + + for (uint ite = 0; ite < iteration_num; ++ite) { + const int id_offset = ite * local_batch_size * beam_width; + const int hidden_units_offset = id_offset * hidden_units_; + const int vocab_size_units_offset = id_offset * vocab_size_padded_; + + if (!(max_input_length > 1 && step == max_input_length)) { + if (pipeline_para_.rank_ == 0) { + invokeEmbeddingLookupPosEncodingPadCount(decoder_input_buf_ + hidden_units_offset, + gpt_weights->pre_decoder_embedding_table, + gpt_weights->position_encoding_table, + output_ids_buf_ + id_offset, + tiled_total_padding_count_ + id_offset, + local_batch_size * beam_width, + hidden_units_, + (T)(1.0f), + step - 1, + batch_size * beam_width, + 0, + stream_); + sync_check_cuda_error(); + } + std::unordered_map decoder_input_tensors{ + {"decoder_input", + Tensor{MEMORY_GPU, + data_type, + {local_batch_size * beam_width, hidden_units_}, + decoder_input_buf_ + hidden_units_offset}}, + {"finished", + Tensor{MEMORY_GPU, TYPE_BOOL, {local_batch_size * beam_width}, finished_buf_ + id_offset}}, + {"sequence_lengths", + Tensor{MEMORY_GPU, TYPE_INT32, {local_batch_size * beam_width}, sequence_lengths_ + id_offset}}, + {"total_padding_tokens", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size * beam_width}, + tiled_total_padding_count_ + id_offset}}, + {"d_prefix_prompt_lengths", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size}, + has_prefix_prompt_ ? (tiled_prompt_lengths_buf_ + id_offset) : nullptr}}, + {"max_prefix_prompt_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_prefix_prompt_length}}, + {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, + {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, + {"ite", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &ite}}, + {"cache_indirection", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size, beam_width, max_output_seq_len}, + beam_width > 1 ? 
cache_indirections_[src_indir_idx] + id_offset * max_output_seq_len : + nullptr}}, + {"masked_tokens", + Tensor{MEMORY_GPU, + TYPE_BOOL, + {local_batch_size * beam_width, max_cache_seq_len}, + masked_tokens_ + id_offset * max_cache_seq_len}}}; + std::unordered_map decoder_output_tensors{ + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {local_batch_size * beam_width, hidden_units_}, + decoder_output_buf_ + hidden_units_offset}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}}; + gpt_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); + } + + if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { + invokeGeneralLayerNorm(normed_decoder_output_buf_ + hidden_units_offset, + decoder_output_buf_ + hidden_units_offset, + gpt_weights->post_decoder_layernorm.gamma, + gpt_weights->post_decoder_layernorm.beta, + layernorm_eps_, + local_batch_size * beam_width, + hidden_units_, + (float*)nullptr, + 0, + stream_); + sync_check_cuda_error(); + + if (tensor_para_.world_size_ == 1) { + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + vocab_size_padded_, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + logits_buf_ + vocab_size_units_offset, + CUDA_R_32F, + vocab_size_padded_, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); + } + else { + FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); + const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + local_vocab_size, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_ + + tensor_para_.rank_ * local_vocab_size * hidden_units_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + nccl_logits_buf_ + vocab_size_units_offset + + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, + CUDA_R_32F, + local_vocab_size, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); + ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + local_batch_size * beam_width * local_vocab_size, + tensor_para_.rank_, + tensor_para_, + stream_); + invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + tensor_para_.world_size_, + local_batch_size * beam_width, + local_vocab_size, + stream_); + } + + int tmp_local_batch_size = local_batch_size; + bool is_initialize_random_table = step == max_input_length; + std::unordered_map dynamic_decode_input_tensors{ + {"logits", + Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size_padded_}, logits_buf_}}, + // {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size_padded_}, nullptr}}, + {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, + {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, + {"input_lengths", + Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, tiled_input_lengths_buf_}}, + {"sequence_limit_length", Tensor{MEMORY_GPU, TYPE_UINT32, {batch_size}, seq_limit_len_}}, + {"ite", 
Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}}, + {"src_cache_indirection", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size, beam_width, max_output_seq_len}, + cache_indirections_[src_indir_idx] + id_offset * max_output_seq_len}}, + {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &tmp_local_batch_size}}, + {"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, end_ids_buf_}}, + {"is_initialize_random_table", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &is_initialize_random_table}}}; + + for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) { + if (dynamic_decode_input_tensors.find(t->first) == dynamic_decode_input_tensors.end()) { + dynamic_decode_input_tensors.insert(*t); + } + } + + // common outputs + bool subbatch_should_stop = false; + std::unordered_map dynamic_decode_output_tensors{ + {"output_ids", + Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, output_ids_buf_}}, + {"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, finished_buf_}}, + // cum_log_probs is necessary for beam search, while it is optional for sampling. + {"cum_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + {batch_size * beam_width}, + ((beam_width > 1) || (output_tensors->count("cum_log_probs") > 0)) ? cum_log_probs_ : + nullptr}}, + {"output_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + {max_seq_len, batch_size, beam_width}, + output_tensors->count("output_log_probs") > 0 + && output_tensors->at("output_log_probs").data != nullptr ? + output_log_probs_buf_ : + nullptr}}, + {"parent_ids", + Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, parent_ids_buf_}}, + {"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, sequence_lengths_}}, + {"tgt_cache_indirection", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size, beam_width, max_output_seq_len}, + cache_indirections_[tgt_indir_idx] + id_offset * max_output_seq_len}}, + {"should_stop", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &subbatch_should_stop}}}; + + for (auto t = output_tensors->begin(); t != output_tensors->end(); ++t) { + // Handle exceptions. 
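+                    // Note: "cum_log_probs" and "output_log_probs" are already mapped to the
+                    // internal buffers (cum_log_probs_ / output_log_probs_buf_) above; the
+                    // user-provided tensors are filled from those buffers later in
+                    // setOutputTensors(), so they are skipped here.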
+ if (t->first == "cum_log_probs" || t->first == "output_log_probs") { + continue; + } + dynamic_decode_output_tensors.insert(*t); + } + + dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors); + *generation_should_stop_ &= subbatch_should_stop; + } + } + + if (pipeline_para_.world_size_ > 1) { + ftNcclGroupStart(); + ftNcclBroadCast(output_ids_buf_ + step * batch_size * beam_width, + batch_size * beam_width, + pipeline_para_.world_size_ - 1, + pipeline_para_, + stream_); + + ftNcclBroadCast( + sequence_lengths_, batch_size * beam_width, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); + + ftNcclBroadCast(generation_should_stop_, 1, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); + + if (beam_width > 1) { + ftNcclBroadCast(cache_indirections_[tgt_indir_idx], + batch_size * beam_width * max_output_seq_len, + pipeline_para_.world_size_ - 1, + pipeline_para_, + stream_); + } + ftNcclGroupEnd(); + // throw errors when detected + ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); + sync_check_cuda_error(); + } + + if (*generation_should_stop_) { + break; + } + if (token_generated_cb_ && step + 1 < (int)max_output_seq_len) { + setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); + sendTensorsToFirstPipelineNode(output_tensors, input_tensors); + + if (pipeline_para_.rank_ == 0 && tensor_para_.rank_ == 0) { + token_generated_cb_(output_tensors, token_generated_ctx_); + } + } + if (step == max_input_length) { + /* We have just finished processing input: update the padding count: + * total_padding_count += (max_input_length - input_lengths) + * if has prefix prompts, += (max_prefix_prompt_length - prompt_length) + */ + invokeUpdatePaddingCount(tiled_total_padding_count_, + input_tensors->at("input_lengths").getPtr(), // not_tiled + has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : (const int*)nullptr, + max_input_length, + has_prefix_prompt_ ? 
max_prefix_prompt_length : 0, + batch_size, + beam_width, + stream_); + } + } + + setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); + sendTensorsToFirstPipelineNode(output_tensors, input_tensors); +} + +template +void GptNeoX::sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + if (pipeline_para_.world_size_ == 1) { + // throw errors when detected + ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); + return; + } + + const auto pp_rank = pipeline_para_.rank_; + + ftNcclGroupStart(); + for (auto const& it : *output_tensors) { + if (it.second.data == nullptr) { + continue; + } + + if (pp_rank == pipeline_para_.world_size_ - 1) { + ftNcclSend(it.second.getPtr(), it.second.sizeBytes(), 0, pipeline_para_, stream_); + } + else if (pp_rank == 0) { + ftNcclRecv(it.second.getPtr(), + it.second.sizeBytes(), + pipeline_para_.world_size_ - 1, + pipeline_para_, + stream_); + } + } + ftNcclGroupEnd(); + // throw errors when detected + ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); +} + +template +void GptNeoX::setOutputTensors(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const size_t max_input_length, + const size_t max_output_seq_len) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + if (pipeline_para_.rank_ != pipeline_para_.world_size_ - 1) { + return; + } + + const size_t batch_size = output_tensors->at("output_ids").shape[0]; + const size_t beam_width = output_tensors->at("output_ids").shape[1]; + uint* sequence_lengths = output_tensors->at("sequence_length").getPtr(); + const size_t max_prefix_soft_prompt_length = + has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; + + if (input_tensors->at("input_ids").shape[1] == 0) { + invokeCudaD2DcpyConvert( + sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); + // TODO: D2D sequence_lenghts + if (beam_width > 1) { + // For beam search, do gather_tree + // take output_parent_ids as inter buffer + invokeGatherTree(transposed_output_ids_buf_, + sequence_lengths_, + max_output_seq_len, + batch_size, + beam_width, + output_ids_buf_ + batch_size * beam_width, + parent_ids_buf_ + batch_size * beam_width, + end_ids_buf_, + stream_); + + // transpose and take output_parent_ids as inter buffer + invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), + transposed_output_ids_buf_, + max_output_seq_len - 1, + batch_size * beam_width, + 1, + stream_); + } + else { + // For sampling, only copy the results to output_tensor + invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), + output_ids_buf_ + batch_size * beam_width, + max_output_seq_len - 1, + batch_size * beam_width, + 1, + stream_); + } + } + else { + + // For sampling, it is equivalent to all parent ids are 0. + gatherTreeParam param; + param.beams = transposed_output_ids_buf_; + param.max_sequence_lengths = sequence_lengths_; + // add sequence_length 1 here because the sequence_length of time step t is t - 1 + param.max_sequence_length_final_step = 1; + param.max_time = max_output_seq_len; + param.batch_size = batch_size; + param.beam_width = beam_width; + param.step_ids = output_ids_buf_; + param.parent_ids = beam_width == 1 ? nullptr : parent_ids_buf_; + param.end_tokens = end_ids_buf_; + param.max_input_length = max_input_length; + param.prefix_soft_prompt_lengths = + has_prefix_soft_prompt_ ? 
input_tensors->at("request_prompt_lengths").getPtr() : nullptr; + param.input_lengths = tiled_input_lengths_buf_; + param.max_prefix_soft_prompt_length = max_prefix_soft_prompt_length; + param.max_input_without_prompt_length = max_input_length; + param.stream = stream_; + param.output_ids = output_tensors->at("output_ids").getPtr(); + invokeGatherTree(param); + invokeCudaD2DcpyConvert( + sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); + sync_check_cuda_error(); + } + if ((output_tensors->count("output_log_probs") > 0 && output_tensors->at("output_log_probs").data != nullptr)) { + invokeTransposeAxis01(output_tensors->at("output_log_probs").getPtr(), + output_log_probs_buf_, + input_tensors->at("output_seq_len").max() - max_input_length, + batch_size * beam_width, + 1, + stream_); + } + // Return the cumulative log probability if requested. + if (output_tensors->count("cum_log_probs") > 0) { + Tensor cum_log_probs = output_tensors->at("cum_log_probs"); + FT_CHECK_WITH_INFO(cum_log_probs.size() == batch_size * beam_width, + "The shape of cum_log_probs does not match with batch_size x beam_width."); + cudaAutoCpy(cum_log_probs.getPtr(), cum_log_probs_, cum_log_probs.size(), stream_); + } +} + +template +size_t GptNeoX::getPipelineParallelRank() +{ + return pipeline_para_.rank_; +} + +template +size_t GptNeoX::getPipelineParallelSize() +{ + return pipeline_para_.world_size_; +} + +template +size_t GptNeoX::getTensorParallelRank() +{ + return tensor_para_.rank_; +} + +template +size_t GptNeoX::getTensorParallelSize() +{ + return tensor_para_.world_size_; +} + +template +bool* GptNeoX::getFinishBuffer() +{ + return finished_buf_; +} + +template class GptNeoX; +template class GptNeoX; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h new file mode 100644 index 000000000..9749a2070 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include "src/fastertransformer/layers/DynamicDecodeLayer.h" +#include "src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h" +#include "src/fastertransformer/models/gptneox/GptNeoXDecoder.h" +#include "src/fastertransformer/models/gptneox/GptNeoXWeight.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/prompt_learning.h" + +namespace fastertransformer { + +template +class GptNeoX: public BaseLayer { +private: + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t vocab_size_; + size_t rotary_embedding_dim_; + + static constexpr bool neox_rotary_style_ = true; + static constexpr float layernorm_eps_ = 1e-5f; + + int start_id_; + int end_id_; + size_t hidden_units_; + + size_t local_head_num_; + NcclParam tensor_para_; + NcclParam pipeline_para_; + + std::shared_ptr custom_all_reduce_comm_; + int enable_custom_all_reduce_; + + AttentionType attention_type_; + + size_t vocab_size_padded_; + const bool is_context_qk_buf_float_ = + (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || + std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); + + // Residual Type + const bool use_gptj_residual_ = true; + + // Prompt Learning Parameters + PromptLearningType prompt_learning_type_; + int prompt_learning_start_id_; // start_id for prompt_learning (only needed by prefix prompts) + bool has_prefix_prompt_; + bool has_prefix_soft_prompt_; + + GptNeoXDecoder* gpt_decoder_; + GptNeoXContextDecoder* gpt_context_decoder_; + DynamicDecodeLayer* dynamic_decode_layer_; + + void allocateBuffer() override; + void allocateBuffer( + size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); + void freeBuffer() override; + + void initialize(); + +protected: + T* padded_embedding_kernel_; + T* padded_embedding_bias_; + const T* padded_embedding_kernel_ptr_; + + T* input_attention_mask_; + + T* decoder_input_buf_; + T* decoder_output_buf_; + T* normed_decoder_output_buf_; + + float* logits_buf_; + float* nccl_logits_buf_; + float* cum_log_probs_; + + bool* finished_buf_; + bool* h_finished_buf_; + int* sequence_lengths_ = nullptr; + int* tiled_total_padding_count_ = nullptr; + uint32_t* seq_limit_len_ = nullptr; + + T* key_cache_; + T* value_cache_; + int* cache_indirections_[2] = {nullptr, nullptr}; + + // prompt_learning weight_batch ptrs + const T** prompt_learning_weight_batch_; + int* tiled_prompt_lengths_buf_; // only needed by prefix prompts + + int* tiled_input_ids_buf_; + int* tiled_input_lengths_buf_; + int* transposed_output_ids_buf_; + int* output_ids_buf_; + int* parent_ids_buf_; + int* start_ids_buf_; + int* end_ids_buf_; + bool* masked_tokens_ = nullptr; + + bool* generation_should_stop_ = nullptr; + + T* context_decoder_input_buf_; + T* context_decoder_output_buf_; + float* output_log_probs_buf_; + + // function pointer callback + using callback_sig = void(std::unordered_map*, void*); + callback_sig* token_generated_cb_ = nullptr; + void* token_generated_ctx_ = nullptr; + + void setOutputTensors(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const size_t max_input_length, + const size_t max_seq_len); + void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors); + +public: + GptNeoX(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, 
+ size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce = 0); + + GptNeoX(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce = 0); + + GptNeoX(GptNeoX const& GptNeoX); + + ~GptNeoX(); + + void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const GptNeoXWeight* gpt_weights); + + void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const GptNeoXWeight* gpt_weights); + + size_t getPipelineParallelRank(); + size_t getPipelineParallelSize(); + size_t getTensorParallelRank(); + size_t getTensorParallelSize(); + bool* getFinishBuffer(); + + void registerCallback(callback_sig* fn, void* ctx); + void unRegisterCallback(); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc new file mode 100644 index 000000000..f23d1a977 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h" +#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" +#include "src/fastertransformer/kernels/gpt_kernels.h" + +#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.h" + +namespace fastertransformer { + +template +void GptNeoXContextDecoder::initialize() +{ + self_attention_layer_ = new TensorParallelGptContextAttentionLayer(0, // max_batch_size + 0, // max_seq_len + head_num_, + size_per_head_, + rotary_embedding_dim_, + neox_rotary_style_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + is_qk_buf_float_, + false, + 0, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size + 0, // max_seq_len + head_num_, + size_per_head_, + 0, // expert_num + inter_size_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + false, + 0, + false, // use_gated_activation = false; + custom_all_reduce_comm_, + enable_custom_all_reduce_); +} + +template +void GptNeoXContextDecoder::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void GptNeoXContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +{ + decoder_normed_input_ = reinterpret_cast( + allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + self_attn_output_ = reinterpret_cast( + allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + ffn_output_ = reinterpret_cast( + allocator_->reMalloc(ffn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + decoder_layer_output_ = reinterpret_cast( + allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); + padding_offset_ = + reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); + cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + is_allocate_buffer_ = true; +} + +template +void GptNeoXContextDecoder::freeBuffer() +{ + if (is_allocate_buffer_ == true) { + allocator_->free((void**)(&decoder_normed_input_)); + allocator_->free((void**)(&self_attn_output_)); + allocator_->free((void**)(&ffn_output_)); + allocator_->free((void**)(&decoder_layer_output_)); + allocator_->free((void**)(&h_pinned_token_num_ptr_), true); + allocator_->free((void**)(&padding_offset_)); + allocator_->free((void**)(&cu_seqlens_)); + is_allocate_buffer_ = false; + } +} + +template +bool GptNeoXContextDecoder::isValidLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) + && (l < local_num_layer * (pipeline_para_.rank_ + 1)); +} + +template +bool GptNeoXContextDecoder::isFirstLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); +} + +template +bool GptNeoXContextDecoder::isLastLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l 
< num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); +} + +template +int GptNeoXContextDecoder::getFirstLayerParallelId() +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return local_num_layer * pipeline_para_.rank_; +} + +template +GptNeoXContextDecoder::GptNeoXContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + rotary_embedding_dim_(rotary_embedding_dim), + neox_rotary_style_(neox_rotary_style), + use_gptj_residual_(use_gptj_residual), + layernorm_eps_(layernorm_eps), + hidden_units_(head_num * size_per_head), + tensor_para_(tensor_para), + pipeline_para_(pipeline_para), + is_qk_buf_float_(is_qk_buf_float), + attention_type_(attention_type), + custom_all_reduce_comm_(custom_all_reduce_comm), + enable_custom_all_reduce_(enable_custom_all_reduce) +{ + initialize(); +} + +template +GptNeoXContextDecoder::GptNeoXContextDecoder(GptNeoXContextDecoder const& decoder): + BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), + head_num_(decoder.head_num_), + size_per_head_(decoder.size_per_head_), + inter_size_(decoder.inter_size_), + num_layer_(decoder.num_layer_), + rotary_embedding_dim_(decoder.rotary_embedding_dim_), + neox_rotary_style_(decoder.neox_rotary_style_), + use_gptj_residual_(decoder.use_gptj_residual_), + layernorm_eps_(decoder.layernorm_eps_), + hidden_units_(decoder.hidden_units_), + tensor_para_(decoder.tensor_para_), + pipeline_para_(decoder.pipeline_para_), + is_qk_buf_float_(decoder.is_qk_buf_float_), + attention_type_(decoder.attention_type_), + custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), + enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) +{ + initialize(); +} + +template +GptNeoXContextDecoder::~GptNeoXContextDecoder() +{ + delete self_attention_layer_; + delete ffn_layer_; + freeBuffer(); +} + +template +void GptNeoXContextDecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, + {"attention_mask", input_tensors->at(1)}, + {"input_lengths", input_tensors->at(2)}}; + std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, + {"key_cache", output_tensors->at(1)}, + {"value_cache", output_tensors->at(2)}, + {"last_token_hidden_units", output_tensors->at(3)}}; + + forward(&output_tensors_map, &input_tensors_map, gpt_decoder_layer_weight); +} + +template +void GptNeoXContextDecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + // input tensors: + // decoder_input [batch_size, seq_len, hidden_dimension], + // attention_mask [batch_size, 1, seq_len, seq_len + max_prompt_length] + // input_lengths [batch_size] + // 
d_prefix_prompt_batch [batch_size], + // each element contains ptr with buffer shape[2, local_head_num_, prompt_length, size_per_head] + // prefix_prompt_lengths [batch size] + + // output tensors: + // decoder_output [batch_size, seq_len, hidden_dimension], + // key_cache [num_layer, batch, local_head_num, size_per_head // x, max_seq_len, x] + // value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] + // last_token_hidden_units [batch_size, hidden_dimension] + + // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * local_batch_size'. + // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during + // computing. + + FT_CHECK(input_tensors->size() == 5); + FT_CHECK(output_tensors->size() == 4); + + const int batch_size = input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; + const int max_prompt_length = + input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; + const DataType data_type = getTensorType(); + allocateBuffer(batch_size, seq_len); + + T* decoder_input = input_tensors->at("decoder_input").getPtr(); + T* decoder_output = output_tensors->at("decoder_output").getPtr(); + const T* attention_mask = input_tensors->at("attention_mask").getPtr(); + const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); + const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + + const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); + FT_CHECK(batch_size % local_batch_size == 0); + const int iteration_num = batch_size / local_batch_size; + + Tensor& k_cache = output_tensors->at("key_cache"); + Tensor& v_cache = output_tensors->at("value_cache"); + std::vector self_k_cache_size; + self_k_cache_size.push_back(local_batch_size); + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + self_k_cache_size.push_back(*t); + } + std::vector self_v_cache_size; + self_v_cache_size.push_back(local_batch_size); + for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { + self_v_cache_size.push_back(*t); + } + + AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? 
+ getUnfusedAttentionType(attention_type_) : + attention_type_; + const bool is_unpadded_mha = isUnPaddedMHA(attention_type); + + for (int ite = 0; ite < iteration_num; ite++) { + size_t h_token_num = local_batch_size * seq_len; + if (is_unpadded_mha) { + const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, + &h_token_num, + padding_offset_, + cu_seqlens_, + base_input_lengths + ite * local_batch_size, + local_batch_size, + seq_len, + stream_); + } + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l) == false) { + continue; + } + + if (l == 0 && is_unpadded_mha) { + invokeRemovePadding(decoder_layer_output_, + decoder_input + ite * local_batch_size * seq_len * hidden_units_, + padding_offset_, + h_token_num, + hidden_units_, + stream_); + } + + const bool is_final = false; // TODO(bhsueh) remove this flag + T* layer_input = decoder_layer_output_; + T* layer_output = decoder_layer_output_; + if (!is_unpadded_mha) { + if (l == 0) { + layer_input = decoder_input; + layer_input += ite * local_batch_size * seq_len * hidden_units_; + } + if (l == num_layer_ - 1) { + layer_output = decoder_output; + layer_output += ite * local_batch_size * seq_len * hidden_units_; + } + } + + if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_ / tensor_para_.world_size_; + ftNcclRecv(layer_input + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ - 1, + pipeline_para_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); + } + } + + invokeGeneralLayerNorm(decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + h_token_num, + hidden_units_, + (float*)nullptr, + 0, + stream_); + sync_check_cuda_error(); + + TensorMap self_attention_input_tensors{ + {"input_query", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, + attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, + {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, + {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, + {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; + self_attention_input_tensors.insertIfValid( + "d_prefix_prompt_batch", + Tensor{MEMORY_GPU, + data_type, + {(size_t)local_batch_size}, + d_prefix_prompt_batch != nullptr ? d_prefix_prompt_batch + ite * local_batch_size : nullptr}); + self_attention_input_tensors.insertIfValid("d_prefix_prompt_lengths", + Tensor{MEMORY_GPU, + TYPE_INT32, + {(size_t)local_batch_size}, + d_prefix_prompt_lengths != nullptr ? 
+ d_prefix_prompt_lengths + ite * local_batch_size : + nullptr}); + + if (is_unpadded_mha) { + self_attention_input_tensors.insert("padding_offset", + Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset_}); + self_attention_input_tensors.insert( + "cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {size_t(local_batch_size + 1)}, cu_seqlens_}); + } + + size_t cache_offset = l - getFirstLayerParallelId(); + for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { + cache_offset *= *t; + }; + size_t ite_cache_offset = ite * local_batch_size; + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + ite_cache_offset *= *t; + } + cache_offset += ite_cache_offset; + + TensorMap self_attention_output_tensors{ + {"hidden_features", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, + {"value_cache", + Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + + self_attention_layer_->forward(&self_attention_output_tensors, + &self_attention_input_tensors, + &gpt_decoder_layer_weight->at(l)->self_attention_weights); + + if (is_final == false) { + if (use_gptj_residual_) { + invokeGeneralLayerNorm(decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + layernorm_eps_, + h_token_num, + hidden_units_, + (float*)nullptr, + 0, + stream_); + } + else { + invokeGeneralAddBiasResidualPreLayerNorm( + self_attn_output_, + decoder_normed_input_, + self_attn_output_, + layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + gpt_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + h_token_num, + hidden_units_, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + 0, + stream_); + } + + TensorMap ffn_input_tensors( + {{"ffn_input", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); + TensorMap ffn_output_tensors({{"ffn_output", + Tensor{MEMORY_GPU, + data_type, + {h_token_num, (size_t)hidden_units_}, + use_gptj_residual_ ? 
ffn_output_ : layer_output}}}); + ffn_layer_->forward( + &ffn_output_tensors, &ffn_input_tensors, &gpt_decoder_layer_weight->at(l)->ffn_weights); + + if (use_gptj_residual_) { + // Original workflow: + // layer_output = layer_input + reduceSum(ffn_output + self_attn_output + ffn_output_bias) + // Our workflow: + // layer_output = reduceSum(ffn_output + self_attn_output + ffn_output_bias + layer_input / + // TP_size) + // They are equivalent on math, but we can use same buffer for layer_input and layer_output + + invokeAddBiasAttentionFfnResidual(layer_output, + ffn_output_, + self_attn_output_, + layer_input, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + tensor_para_.world_size_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllReduceSum( + layer_output, layer_output, h_token_num * hidden_units_, tensor_para_, stream_); + } + } + else { + invokeAddBiasResidual(layer_output, + self_attn_output_, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + stream_); + } + + sync_check_cuda_error(); + + if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 + && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_ / tensor_para_.world_size_; + ftNcclSend(layer_output + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ + 1, + pipeline_para_, + stream_); + } + + if ((l == num_layer_ - 1) && is_unpadded_mha) { + invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, + decoder_layer_output_, + padding_offset_, + h_token_num, + head_num_ * size_per_head_, + stream_); + } + } + } + } + + // TODO(bhsueh) We could optimize this point by only computing the last token for the last layer + invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), + output_tensors->at("decoder_output").getPtr(), + input_tensors->at("input_lengths").getPtr(), + seq_len, + batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } +} + +template class GptNeoXContextDecoder; +template class GptNeoXContextDecoder; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h new file mode 100644 index 000000000..c81dcfe90 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include "src/fastertransformer/kernels/add_residual_kernels.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/layers/BaseLayer.h" +#include "src/fastertransformer/layers/FfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" +#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/utils/Tensor.h" +#include "src/fastertransformer/utils/allocator.h" +#include "src/fastertransformer/utils/cublasMMWrapper.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/nccl_utils.h" + +namespace fastertransformer { + +template +class GptNeoXContextDecoder: public BaseLayer { +private: + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t rotary_embedding_dim_; + bool neox_rotary_style_; + bool use_gptj_residual_; + float layernorm_eps_; + + // calculated data + size_t hidden_units_; + + NcclParam tensor_para_; + NcclParam pipeline_para_; + + std::shared_ptr custom_all_reduce_comm_; + int enable_custom_all_reduce_; + + AttentionType attention_type_; + + bool is_qk_buf_float_; + + BaseAttentionLayer* self_attention_layer_; + FfnLayer* ffn_layer_; + + void allocateBuffer() override; + void allocateBuffer(size_t batch_size, size_t seq_len); + void freeBuffer() override; + + bool isValidLayerParallelId(uint l); + bool isFirstLayerParallelId(uint l); + bool isLastLayerParallelId(uint l); + int getFirstLayerParallelId(); + + void initialize(); + +protected: + T* decoder_normed_input_ = nullptr; + T* self_attn_output_ = nullptr; + T* ffn_output_ = nullptr; + T* decoder_layer_output_ = nullptr; + size_t* h_pinned_token_num_ptr_ = nullptr; + int* padding_offset_ = nullptr; + int* cu_seqlens_ = nullptr; + +public: + GptNeoXContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type = AttentionType::FUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce_ = 0); + + GptNeoXContextDecoder(GptNeoXContextDecoder const& decoder); + + ~GptNeoXContextDecoder(); + + void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* decoder_layer_weights); + + void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* gpt_decoder_layer_weight); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoder.cc b/src/fastertransformer/models/llama/LLaMADecoder.cc new file mode 100644 index 000000000..7b73ba8ee --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMADecoder.cc @@ -0,0 +1,391 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/models/gptneox/GptNeoXDecoder.h" +#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.h" + +namespace fastertransformer { + +template +void GptNeoXDecoder::initialize() +{ + self_attention_layer_ = new TensorParallelDecoderSelfAttentionLayer(0, // max_batch_size + head_num_, + size_per_head_, + rotary_embedding_dim_, + neox_rotary_style_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + false, + 0, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size + 1, + head_num_, + size_per_head_, + 0, // expert_num + inter_size_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + false, + 0, + false, // use_gated_activation = false; + custom_all_reduce_comm_, + enable_custom_all_reduce_); +} + +template +void GptNeoXDecoder::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void GptNeoXDecoder::allocateBuffer(size_t batch_size) +{ + decoder_normed_input_ = reinterpret_cast( + allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * hidden_units_, false)); + self_attn_output_ = + reinterpret_cast(allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * hidden_units_, false)); + ffn_output_ = + reinterpret_cast(allocator_->reMalloc(ffn_output_, sizeof(T) * batch_size * hidden_units_, false)); + decoder_layer_output_ = reinterpret_cast( + allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * hidden_units_, false)); + is_allocate_buffer_ = true; +} + +template +void GptNeoXDecoder::freeBuffer() +{ + if (is_allocate_buffer_ == true) { + allocator_->free((void**)(&decoder_normed_input_)); + allocator_->free((void**)(&self_attn_output_)); + allocator_->free((void**)(&ffn_output_)); + allocator_->free((void**)(&decoder_layer_output_)); + is_allocate_buffer_ = false; + } +} + +template +bool GptNeoXDecoder::isValidLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) + && (l < local_num_layer * (pipeline_para_.rank_ + 1)); +} + +template +bool GptNeoXDecoder::isFirstLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); +} + +template +bool GptNeoXDecoder::isLastLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); +} + +template +int GptNeoXDecoder::getFirstLayerParallelId() +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return local_num_layer * pipeline_para_.rank_; +} + +template +GptNeoXDecoder::GptNeoXDecoder(size_t head_num, + size_t size_per_head, + 
size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + rotary_embedding_dim_(rotary_embedding_dim), + neox_rotary_style_(neox_rotary_style), + use_gptj_residual_(use_gptj_residual), + layernorm_eps_(layernorm_eps), + hidden_units_(head_num_ * size_per_head), + tensor_para_(tensor_para), + pipeline_para_(pipeline_para), + custom_all_reduce_comm_(custom_all_reduce_comm), + enable_custom_all_reduce_(enable_custom_all_reduce) +{ + initialize(); +} + +template +GptNeoXDecoder::GptNeoXDecoder(GptNeoXDecoder const& decoder): + BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), + head_num_(decoder.head_num_), + size_per_head_(decoder.size_per_head_), + inter_size_(decoder.inter_size_), + num_layer_(decoder.num_layer_), + rotary_embedding_dim_(decoder.rotary_embedding_dim_), + neox_rotary_style_(decoder.neox_rotary_style_), + use_gptj_residual_(decoder.use_gptj_residual_), + layernorm_eps_(decoder.layernorm_eps_), + hidden_units_(decoder.hidden_units_), + tensor_para_(decoder.tensor_para_), + pipeline_para_(decoder.pipeline_para_), + custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), + enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) +{ + initialize(); +} + +template +GptNeoXDecoder::~GptNeoXDecoder() +{ + delete self_attention_layer_; + delete ffn_layer_; + freeBuffer(); +} + +template +void GptNeoXDecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + FT_CHECK(false); +} + +template +void GptNeoXDecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + // input tensors: + // decoder_input [local_batch_size, hidden_dimension], + // finished [local_batch_size], + // sequence_lengths [local_batch_size] + // total_padding_tokens [local_batch_size], + // max_input_length [1] on cpu + // d_prefix_prompt_lengths [local_batch_size], on GPU + // max_prefix_prompt_length [1] on cpu + // step [1] on cpu + // ite [1] on cpu + // cache_indirection [local_batch_size / beam_width, beam_width, memory_len] + // Here, local_batch_size contains the beam_width, so local_batch_size / beam_width + // is real local_batch_size. 
+ // masked_tokens[local_batch_size, memory_len] + + // output tensors: + // decoder_output [local_batch_size, hidden_dimension], + // key_cache [num_layer, batch_size, head_num, size_per_head // x, memory_len, x] + // value_cache [num_layer, batch_size, head_num, memory_len, size_per_head] + + FT_CHECK(input_tensors->size() == 11); + FT_CHECK(output_tensors->size() == 3); + + const DataType data_type = getTensorType(); + const size_t local_batch_size = input_tensors->at("decoder_input").shape[0]; + allocateBuffer(local_batch_size); + const int ite = input_tensors->at("ite").getVal(); + + T* decoder_input = input_tensors->at("decoder_input").getPtr(); + T* decoder_output = output_tensors->at("decoder_output").getPtr(); + + Tensor& k_cache = output_tensors->at("key_cache"); + Tensor& v_cache = output_tensors->at("value_cache"); + std::vector self_k_cache_size; + self_k_cache_size.push_back(local_batch_size); + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + self_k_cache_size.push_back(*t); + } + std::vector self_v_cache_size; + self_v_cache_size.push_back(local_batch_size); + for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { + self_v_cache_size.push_back(*t); + } + + for (uint l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l) == false) { + continue; + } + T* layer_input = (l == 0) ? decoder_input : decoder_layer_output_; + T* layer_output = (l == num_layer_ - 1) ? decoder_output : decoder_layer_output_; + + if (isFirstLayerParallelId(l) == true && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { + int data_size = local_batch_size * hidden_units_ / tensor_para_.world_size_; + // ftNcclRecv(layer_input, local_batch_size * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, + // stream_); + + ftNcclRecv(layer_input + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ - 1, + pipeline_para_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); + } + } + + invokeGeneralLayerNorm(decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + local_batch_size, + hidden_units_, + (float*)nullptr, + 0, + stream_); + sync_check_cuda_error(); + + TensorMap self_attention_input_tensors(*input_tensors); + self_attention_input_tensors.insert( + "input_query", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}); + + size_t cache_offset = l - getFirstLayerParallelId(); + for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { + cache_offset *= *t; + }; + size_t ite_cache_offset = ite * local_batch_size; + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + ite_cache_offset *= *t; + } + cache_offset += ite_cache_offset; + + TensorMap self_attention_output_tensors{ + {"hidden_features", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, self_attn_output_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + + self_attention_layer_->forward(&self_attention_output_tensors, + &self_attention_input_tensors, + &gpt_decoder_layer_weight->at(l)->self_attention_weights); + if (use_gptj_residual_) { + invokeGeneralLayerNorm(decoder_normed_input_, + 
layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + layernorm_eps_, + local_batch_size, + hidden_units_, + (float*)nullptr, + 0, + stream_); + } + else { + invokeGeneralAddBiasResidualPreLayerNorm( + self_attn_output_, + decoder_normed_input_, + self_attn_output_, + layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + gpt_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + local_batch_size, + hidden_units_, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + 0, + stream_); + } + + TensorMap ffn_input_tensors( + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}}}); + TensorMap ffn_output_tensors({{"ffn_output", + Tensor{MEMORY_GPU, + data_type, + {local_batch_size, hidden_units_}, + use_gptj_residual_ ? ffn_output_ : layer_output}}}); + ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &gpt_decoder_layer_weight->at(l)->ffn_weights); + + if (use_gptj_residual_) { + // Original workflow: + // layer_output = layer_input + reduceSum(ffn_output + self_attn_output + ffn_output_bias) + // Our workflow: + // layer_output = reduceSum(ffn_output + self_attn_output + ffn_output_bias + layer_input / TP_size) + // They are equivalent on math, but we can use same buffer for layer_input and layer_output + invokeAddBiasAttentionFfnResidual(layer_output, + ffn_output_, + self_attn_output_, + layer_input, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + local_batch_size, + hidden_units_, + tensor_para_.world_size_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllReduceSum(layer_output, layer_output, local_batch_size * hidden_units_, tensor_para_, stream_); + } + } + else { + invokeAddBiasResidual(layer_output, + self_attn_output_, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + local_batch_size, + hidden_units_, + stream_); + } + + sync_check_cuda_error(); + + if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 + && pipeline_para_.world_size_ > 1) { + int data_size = local_batch_size * hidden_units_ / tensor_para_.world_size_; + // ftNcclSend(layer_output, local_batch_size * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, + // stream_); + + ftNcclSend(layer_output + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ + 1, + pipeline_para_, + stream_); + } + } + + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } +} + +template class GptNeoXDecoder; +template class GptNeoXDecoder; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoder.h b/src/fastertransformer/models/llama/LLaMADecoder.h new file mode 100644 index 000000000..add736adc --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMADecoder.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "src/fastertransformer/kernels/add_residual_kernels.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/layers/BaseLayer.h" +#include "src/fastertransformer/layers/FfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" +#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/utils/Tensor.h" +#include "src/fastertransformer/utils/allocator.h" +#include "src/fastertransformer/utils/cublasMMWrapper.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/nccl_utils.h" + +namespace fastertransformer { + +template +class GptNeoXDecoder: public BaseLayer { +private: +protected: + void allocateBuffer() override; + void allocateBuffer(size_t batch_size); + void freeBuffer() override; + bool isValidLayerParallelId(uint l); + bool isFirstLayerParallelId(uint l); + bool isLastLayerParallelId(uint l); + int getFirstLayerParallelId(); + virtual void initialize(); + + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t rotary_embedding_dim_; + bool neox_rotary_style_; + bool use_gptj_residual_; + size_t hidden_units_; + float layernorm_eps_; + + NcclParam tensor_para_; + NcclParam pipeline_para_; + + std::shared_ptr custom_all_reduce_comm_; + int enable_custom_all_reduce_; + + T* decoder_normed_input_ = nullptr; + T* self_attn_output_ = nullptr; + T* ffn_output_ = nullptr; + T* decoder_layer_output_ = nullptr; + + BaseAttentionLayer* self_attention_layer_; + FfnLayer* ffn_layer_; + +public: + GptNeoXDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce_ = 0); + + GptNeoXDecoder(GptNeoXDecoder const& decoder); + + virtual ~GptNeoXDecoder(); + + virtual void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* decoder_layer_weights); + + virtual void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* decoder_layer_weights); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc new file mode 100644 index 000000000..3d62df83d --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/utils/memory_utils.h" + +namespace fastertransformer { + +template +GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const int hidden_units, + const int inter_size, + const int tensor_para_size, + const int tensor_para_rank, + const bool use_gptj_residual): + hidden_units_(hidden_units), + inter_size_(inter_size), + tensor_para_size_(tensor_para_size), + tensor_para_rank_(tensor_para_rank), + use_gptj_residual_(use_gptj_residual) +{ + mallocWeights(); + setWeightPtr(); +} + +template +GptNeoXDecoderLayerWeight::~GptNeoXDecoderLayerWeight() +{ + if (is_maintain_buffer == true) { + for (int i = 0; i < 12; i++) { + if (!use_gptj_residual_ && i != attention_dense_bias_weight_id) { + cudaFree(weights_ptr[i]); + } + } + + pre_layernorm_weights.beta = nullptr; + pre_layernorm_weights.gamma = nullptr; + self_attention_weights.query_weight.kernel = nullptr; + self_attention_weights.query_weight.bias = nullptr; + self_attention_weights.attention_output_weight.kernel = nullptr; + self_attention_weights.attention_output_weight.bias = nullptr; + post_attention_layernorm_weights.beta = nullptr; + post_attention_layernorm_weights.gamma = nullptr; + + ffn_weights.intermediate_weight.kernel = nullptr; + ffn_weights.intermediate_weight.bias = nullptr; + ffn_weights.output_weight.kernel = nullptr; + ffn_weights.output_weight.bias = nullptr; + is_maintain_buffer = false; + } +} + +template +GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + use_gptj_residual_(other.use_gptj_residual_) +{ + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + if (!use_gptj_residual_) { + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + } + + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + setWeightPtr(); +} + +template +GptNeoXDecoderLayerWeight& GptNeoXDecoderLayerWeight::operator=(const GptNeoXDecoderLayerWeight& other) +{ + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + tensor_para_size_ = 
other.tensor_para_size_; + tensor_para_rank_ = other.tensor_para_rank_; + use_gptj_residual_ = other.use_gptj_residual_; + + mallocWeights(); + + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + if (!use_gptj_residual_) { + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + } + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + setWeightPtr(); + return *this; +} + +template +void GptNeoXDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) +{ + FT_CHECK(is_maintain_buffer == true); + const std::string rank_spec = std::to_string(tensor_para_rank_); + + loadWeightFromBin( + weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".input_layernorm.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".input_layernorm.weight.bin", model_file_type); + loadWeightFromBin(weights_ptr[2], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBin(weights_ptr[3], + {(size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.bias." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBin(weights_ptr[4], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." + rank_spec + ".bin", + model_file_type); + + if (!use_gptj_residual_) { + loadWeightFromBin( + weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.dense.bias.bin", model_file_type); + } + + loadWeightFromBin(weights_ptr[6], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", + model_file_type); + loadWeightFromBin(weights_ptr[7], + {(size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.dense_h_to_4h.bias." + rank_spec + ".bin", + model_file_type); + loadWeightFromBin(weights_ptr[8], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.dense_4h_to_h.weight." 
+ rank_spec + ".bin", + model_file_type); + if (use_gptj_residual_) { + loadWeightFromBin( + weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".mlp.attention.bias.sum.bin", model_file_type); + } + else { + loadWeightFromBin( + weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.bias.bin", model_file_type); + } + loadWeightFromBin( + weights_ptr[10], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[11], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); +} + +template +void GptNeoXDecoderLayerWeight::setWeightPtr() +{ + pre_layernorm_weights.beta = weights_ptr[0]; + pre_layernorm_weights.gamma = weights_ptr[1]; + self_attention_weights.query_weight.kernel = weights_ptr[2]; + self_attention_weights.query_weight.bias = weights_ptr[3]; + self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; + self_attention_weights.attention_output_weight.bias = use_gptj_residual_ ? nullptr : weights_ptr[5]; + + ffn_weights.intermediate_weight.kernel = weights_ptr[6]; + ffn_weights.intermediate_weight.bias = weights_ptr[7]; + ffn_weights.output_weight.kernel = weights_ptr[8]; + ffn_weights.output_weight.bias = weights_ptr[9]; + + post_attention_layernorm_weights.beta = weights_ptr[10]; + post_attention_layernorm_weights.gamma = weights_ptr[11]; + is_maintain_buffer = true; +} + +template +void GptNeoXDecoderLayerWeight::mallocWeights() +{ + deviceMalloc(&weights_ptr[0], hidden_units_); + deviceMalloc(&weights_ptr[1], hidden_units_); + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + if (!use_gptj_residual_) { + deviceMalloc(&weights_ptr[5], hidden_units_); + } + + deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); + deviceMalloc(&weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); + deviceMalloc(&weights_ptr[9], hidden_units_); + deviceMalloc(&weights_ptr[10], hidden_units_); + deviceMalloc(&weights_ptr[11], hidden_units_); +} + +template struct GptNeoXDecoderLayerWeight; +template struct GptNeoXDecoderLayerWeight; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h new file mode 100644 index 000000000..2850da466 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/layers/FfnWeight.h" +#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" +#include "src/fastertransformer/utils/cuda_utils.h" + +namespace fastertransformer { + +template +struct GptNeoXDecoderLayerWeight { +public: + GptNeoXDecoderLayerWeight() = default; + GptNeoXDecoderLayerWeight(const int hidden_units, + const int inter_size, + const int tensor_para_size = 1, + const int tensor_para_rank = 0, + const bool use_gptj_residual = true); + ~GptNeoXDecoderLayerWeight(); + GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other); + GptNeoXDecoderLayerWeight& operator=(const GptNeoXDecoderLayerWeight& other); + + void loadModel(std::string dir_path, FtCudaDataType model_file_type); + + LayerNormWeight pre_layernorm_weights; + AttentionWeight self_attention_weights; + LayerNormWeight post_attention_layernorm_weights; + FfnWeight ffn_weights; + +private: + int hidden_units_; + int inter_size_; + int tensor_para_size_; + int tensor_para_rank_; + bool use_gptj_residual_; + const int attention_dense_bias_weight_id = 5; + bool is_maintain_buffer = false; + T* weights_ptr[12]; + + void setWeightPtr(); + void mallocWeights(); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc new file mode 100644 index 000000000..26995f255 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/gptneox/GptNeoXWeight.h" + +namespace fastertransformer { + +template +GptNeoXWeight::GptNeoXWeight(const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int max_seq_len, + const int tensor_para_size, + const int tensor_para_rank, + const int layer_para_size, + const int layer_para_rank, + const bool use_gptj_residual, + PromptLearningType prompt_learning_type, + std::map> prompt_learning_pair): + hidden_units_(hidden_units), + inter_size_(inter_size), + vocab_size_(vocab_size), + num_layer_(num_layer), + max_seq_len_(max_seq_len), + tensor_para_size_(tensor_para_size), + tensor_para_rank_(tensor_para_rank), + layer_para_size_(layer_para_size), + layer_para_rank_(layer_para_rank), + use_gptj_residual_(use_gptj_residual), + prompt_learning_type_(prompt_learning_type), + prompt_learning_pair_(prompt_learning_pair) +{ + FT_CHECK(num_layer_ % layer_para_size_ == 0); + // set prompt weight size + if (prompt_learning_type_ == PromptLearningType::prefix_prompt) { + prompt_token_weight_size_ = 2 * num_layer_ * hidden_units_ / tensor_para_size_; + } + else if (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) { + prompt_token_weight_size_ = hidden_units_; + } + + // set if load and malloc prompt weights + malloc_load_prompt_weights_ = !prompt_learning_pair_.empty() + && (prompt_learning_type_ == PromptLearningType::p_prompt_tuning + || prompt_learning_type_ == PromptLearningType::prefix_prompt); + + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l)) { + decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight( + hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); + } + else { + // Layer-parallelism: allocate empty layer because + // this rank does not compute it: + decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight(0, 0)); + } + } + + mallocWeights(); + setWeightPtr(); +} + +template +GptNeoXWeight::~GptNeoXWeight() +{ + if (is_maintain_buffer == true) { + for (int i = 0; i < weights_ptr.size(); i++) { + deviceFree(weights_ptr[i]); + } + + pre_decoder_embedding_table = nullptr; + post_decoder_layernorm.beta = nullptr; + post_decoder_layernorm.gamma = nullptr; + post_decoder_embedding.kernel = nullptr; + is_maintain_buffer = false; + } +} + +template +GptNeoXWeight::GptNeoXWeight(const GptNeoXWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + vocab_size_(other.vocab_size_), + num_layer_(other.num_layer_), + max_seq_len_(other.max_seq_len_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + layer_para_size_(other.layer_para_size_), + layer_para_rank_(other.layer_para_rank_), + use_gptj_residual_(other.use_gptj_residual_), + prompt_token_weight_size_(other.prompt_token_weight_size_), + malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), + prompt_learning_type_(other.prompt_learning_type_), + prompt_learning_pair_(other.prompt_learning_pair_) +{ + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning table: malloc weights and set weight ptr + if (malloc_load_prompt_weights_) { + for (auto const& prompt : 
prompt_learning_pair_) { + std::string task_name = prompt.first; + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t prompt_id = num_base_weights + (size_t)task_name_id; + + // cuda device to device memcpy prompt table weights buffer memory + cudaD2Dcpy(weights_ptr[prompt_id], other.weights_ptr[prompt_id], prompt_length * prompt_token_weight_size_); + } + } + + setWeightPtr(); + + decoder_layer_weights.clear(); + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(other.decoder_layer_weights[l]); + } +} + +template +GptNeoXWeight& GptNeoXWeight::operator=(const GptNeoXWeight& other) +{ + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + vocab_size_ = other.vocab_size_; + num_layer_ = other.num_layer_; + max_seq_len_ = other.max_seq_len_; + tensor_para_size_ = other.tensor_para_size_; + tensor_para_rank_ = other.tensor_para_rank_; + layer_para_size_ = other.layer_para_size_; + layer_para_rank_ = other.layer_para_rank_; + use_gptj_residual_ = other.use_gptj_residual_; + prompt_token_weight_size_ = other.prompt_token_weight_size_; + malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; + prompt_learning_type_ = other.prompt_learning_type_; + prompt_learning_pair_ = other.prompt_learning_pair_; + + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning table: malloc weights and set weight ptr + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + std::string task_name = prompt.first; + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t prompt_id = num_base_weights + (size_t)task_name_id; + + // cuda device to device memcpy prompt table weights buffer memory + cudaD2Dcpy(weights_ptr[prompt_id], other.weights_ptr[prompt_id], prompt_length * prompt_token_weight_size_); + } + } + + setWeightPtr(); + + decoder_layer_weights.clear(); + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(other.decoder_layer_weights[l]); + } + return *this; +} + +template +void GptNeoXWeight::setWeightPtr() +{ + prompt_learning_table.resize(prompt_learning_pair_.size()); + + pre_decoder_embedding_table = weights_ptr[0]; + post_decoder_layernorm.beta = weights_ptr[1]; + post_decoder_layernorm.gamma = weights_ptr[2]; + post_decoder_embedding.kernel = weights_ptr[3]; + + // prompt learning tables: set weight ptr + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t task_weight_id = num_base_weights + (size_t)task_name_id; + + // set weight ptr + prompt_learning_table[task_name_id] = {weights_ptr[task_weight_id], prompt_length}; + } + } +} + +template +void GptNeoXWeight::mallocWeights() +{ + weights_ptr.resize(num_base_weights + prompt_learning_pair_.size()); + + deviceMalloc(&weights_ptr[0], vocab_size_ * hidden_units_); + deviceMalloc(&weights_ptr[1], hidden_units_); + deviceMalloc(&weights_ptr[2], hidden_units_); + deviceMalloc(&weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning tables: malloc weights + if 
(malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t task_weight_id = num_base_weights + (size_t)task_name_id; + + // malloc weights + T* prompt_weights_ptr = nullptr; + deviceMalloc(&prompt_weights_ptr, prompt_length * prompt_token_weight_size_); + weights_ptr[task_weight_id] = prompt_weights_ptr; + } + } + is_maintain_buffer = true; +} + +template +void GptNeoXWeight::loadModel(std::string dir_path) +{ + FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "gptneox"); + FT_CHECK(is_maintain_buffer == true); + + loadWeightFromBin( + weights_ptr[0], {(size_t)(vocab_size_ * hidden_units_)}, dir_path + "/model.wte.bin", model_file_type); + loadWeightFromBin( + weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.weight.bin", model_file_type); + loadWeightFromBin(weights_ptr[3], + {(size_t)(vocab_size_ * hidden_units_)}, + dir_path + "/model.lm_head.weight.bin", + model_file_type); + + // prompt table: load weights from bin + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + std::string task_name = prompt.first; + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t task_weight_id = num_base_weights + (size_t)task_name_id; + + std::string prompt_weight_path_name = (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) ? + (dir_path + "/model.prompt_table." + task_name + ".weight.bin") : + (dir_path + "/model.prefix_prompt." + task_name + ".weight." + + std::to_string(tensor_para_rank_) + ".bin"); + + if (prompt_length > 0) { + loadWeightFromBin(weights_ptr[task_weight_id], + {(size_t)(prompt_length * (int)prompt_token_weight_size_)}, + prompt_weight_path_name, + model_file_type); + } + } + } + + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l)) { + decoder_layer_weights[l]->loadModel(dir_path + "/model.layers." + std::to_string(l), model_file_type); + } + } +} + +template +void GptNeoXWeight::resizeLayer(const int num_layer) +{ + num_layer_ = num_layer; + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight()); + } +} + +template +bool GptNeoXWeight::isValidLayerParallelId(int l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / layer_para_size_)); + return l < num_layer_ && (l >= local_num_layer * layer_para_rank_) + && (l < local_num_layer * (layer_para_rank_ + 1)); +} + +template struct GptNeoXWeight; +template struct GptNeoXWeight; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAWeight.h b/src/fastertransformer/models/llama/LLaMAWeight.h new file mode 100644 index 000000000..3e868854e --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMAWeight.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/utils/memory_utils.h" +#include "src/fastertransformer/utils/prompt_learning.h" + +namespace fastertransformer { + +template +struct GptNeoXWeight { + + GptNeoXWeight() = default; + GptNeoXWeight( + const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int max_seq_len, + const int tensor_para_size = 1, + const int tensor_para_rank = 0, + const int layer_para_size = 1, + const int layer_para_rank = 0, + const bool use_gptj_residual_ = true, + PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, + std::map> prompt_learning_pair = std::map>{}); + + ~GptNeoXWeight(); + GptNeoXWeight(const GptNeoXWeight& other); + GptNeoXWeight& operator=(const GptNeoXWeight& other); + + void loadModel(std::string dir_path); + + void resizeLayer(const int num_layer); + + std::vector*> decoder_layer_weights; + const T* pre_decoder_embedding_table = nullptr; + // GPT-J does not use embedding table, but we leave the ptr such that + // GptNeoX::forward and Gpt::forward become identical + const T* position_encoding_table = nullptr; + + /* + prompt_learning_pair = vectors of [weight ptr, prompt length] pair + prompt_length is stored here for compatible prompt learning table + prefix_prompt weights store as shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] + p/prompt tuning weights store as shape [prompt_len, hidden_units] + idx is the task_name_id of the prompt tables + */ + std::vector> prompt_learning_table = {}; + + LayerNormWeight post_decoder_layernorm; + DenseWeight post_decoder_embedding; + + inline void setMaxSeqLen(size_t max_seq_len) + { + max_seq_len_ = max_seq_len; + } + +private: + void setWeightPtr(); + void mallocWeights(); + bool isValidLayerParallelId(int l); + + int hidden_units_; + int inter_size_; + int vocab_size_; + int num_layer_; + int max_seq_len_; + + int tensor_para_size_; + int tensor_para_rank_; + int layer_para_size_; + int layer_para_rank_; + + // residual type + bool use_gptj_residual_; + + // prompt learning pair (task_name, (task_name_id, prompt_len)) + PromptLearningType prompt_learning_type_; + std::map> prompt_learning_pair_; + bool malloc_load_prompt_weights_ = false; + // each prompt token's weight size + size_t prompt_token_weight_size_ = 0; + + bool is_maintain_buffer = false; + const size_t num_base_weights = 4; + std::vector weights_ptr = std::vector(num_base_weights); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/th_op/llama/CMakeLists.txt b/src/fastertransformer/th_op/llama/CMakeLists.txt new file mode 100755 index 000000000..75d13790e --- /dev/null +++ b/src/fastertransformer/th_op/llama/CMakeLists.txt @@ -0,0 +1,17 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_library(th_llama STATIC LLaMA.cc) +set_property(TARGET th_llama PROPERTY POSITION_INDEPENDENT_CODE ON) +target_link_libraries(th_llama PRIVATE "${TORCH_LIBRARIES}" LLaMA th_utils nccl_utils) diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc new file mode 100755 index 000000000..e913570cd --- /dev/null +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/th_op/llama/LLaMA.h" + +namespace th = torch; +namespace ft = fastertransformer; +namespace torch_ext { + +LLaMA::LLaMA(const int64_t head_num, + const int64_t size_per_head, + const int64_t inter_size, + const int64_t layer_num, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t start_id, + const int64_t end_id, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const int64_t max_seq_len, + const bool use_gptj_residual, + const vector weights): + st_(weights[0].scalar_type()) +{ + for (auto t : weights) { + CHECK_INPUT(t, st_); + } + + switch (st_) { + case at::ScalarType::Float: + ftgpt = new FTGptNeoX((size_t)head_num, + (size_t)size_per_head, + (size_t)inter_size, + (size_t)layer_num, + (size_t)vocab_size, + (size_t)rotary_embedding_dim, + start_id, + end_id, + tensor_para_size, + pipeline_para_size, + (size_t)max_seq_len, + use_gptj_residual, + weights); + break; + case at::ScalarType::Half: + ftgpt = new FTGptNeoX((size_t)head_num, + (size_t)size_per_head, + (size_t)inter_size, + (size_t)layer_num, + (size_t)vocab_size, + (size_t)rotary_embedding_dim, + start_id, + end_id, + tensor_para_size, + pipeline_para_size, + (size_t)max_seq_len, + use_gptj_residual, + weights); + break; + default: + throw std::runtime_error("Wrong Tensor type."); + } +} + +LLaMA::~LLaMA() +{ + delete ftgpt; +} + +std::vector LLaMA::forward(th::Tensor input_ids, + th::Tensor input_lengths, + const int64_t output_len, + th::optional beam_width_opt, + th::optional top_k_opt, + th::optional top_p_opt, + th::optional beam_search_diversity_rate_opt, + th::optional temperature_opt, + th::optional len_penalty_opt, + th::optional repetition_penalty_opt, + th::optional random_seed_opt, + th::optional return_cum_log_probs_opt) +{ + CHECK_TH_CUDA(input_ids); + CHECK_CONTIGUOUS(input_ids); + TORCH_CHECK(input_ids.dtype() == torch::kInt32, "input_ids dtype should be int32"); + CHECK_TH_CUDA(input_lengths); + CHECK_CONTIGUOUS(input_lengths); + TORCH_CHECK(input_lengths.dtype() 
== torch::kInt32, "input_lengths dtype should be int32"); + int64_t return_cum_log_probs = return_cum_log_probs_opt.has_value() ? (int64_t)return_cum_log_probs_opt.value() : 0; + if (return_cum_log_probs_opt.has_value()) { + TORCH_CHECK(return_cum_log_probs == 0 || return_cum_log_probs == 1, + "return_cum_log_probs should be" + " 0 (no return cum_log_probs), " + " 1 (the cumulative log probs of generated sequences)") + } + + const int beam_width = beam_width_opt.has_value() ? (int)beam_width_opt.value() : 1; + + const int batch_size = input_ids.size(0); + const int max_input_length = input_ids.size(1); + const int total_request_output_len = max_input_length + output_len; + th::Tensor output_ids = torch::empty({batch_size, beam_width, total_request_output_len}, + torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); + th::Tensor sequence_lengths = + torch::empty({batch_size, beam_width}, torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); + th::Tensor cum_log_probs = + torch::empty({batch_size, beam_width}, torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false)); + + ftgpt->forward(input_ids, + input_lengths, + output_ids, + sequence_lengths, + cum_log_probs, + (const size_t)output_len, + (const size_t)beam_width, + top_k_opt, + top_p_opt, + beam_search_diversity_rate_opt, + temperature_opt, + len_penalty_opt, + repetition_penalty_opt, + random_seed_opt, + return_cum_log_probs_opt); + if (return_cum_log_probs > 0) { + return std::vector{output_ids, sequence_lengths, cum_log_probs}; + } + return std::vector{output_ids, sequence_lengths}; +} + +} // namespace torch_ext + +static auto fasterTransformerGptTHS = +#ifdef LEGACY_THS + torch::jit::class_("FasterTransformerLLaMA") +#else + torch::jit::class_("FasterTransformer", "LLaMA") +#endif + .def(torch::jit::init>()) + .def("forward", &torch_ext::LLaMA::forward); diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h new file mode 100755 index 000000000..3cca0bb19 --- /dev/null +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/LLaMA.h" +#include "src/fastertransformer/th_op/th_utils.h" +#include "src/fastertransformer/utils/cuda_bf16_wrapper.h" +#include "src/fastertransformer/utils/nccl_utils.h" + +namespace ft = fastertransformer; +namespace th = torch; +namespace torch_ext { + +using std::vector; + +class IFLLaMA { +public: + virtual ~IFLLaMA() {} + virtual void forward(th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& output_ids, + th::Tensor& sequence_lengths, + th::Tensor& cum_log_probs, + const size_t request_output_len, + const size_t beam_width, + th::optional top_k_opt, + th::optional top_p_opt, + th::optional beam_search_diversity_rate_opt, + th::optional temperature_opt, + th::optional len_penalty_opt, + th::optional repetition_penalty_opt, + th::optional random_seed_opt, + th::optional return_cum_log_probs_opt) = 0; +}; + +template +class FTLLaMA: public IFLLaMA { +public: + FTLLaMA(const size_t head_num, + const size_t size_per_head, + const size_t inter_size, + const size_t layer_num, + const size_t vocab_size, + const size_t rotary_embedding_dim, + const int start_id, + const int end_id, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const size_t max_seq_len, + const bool use_gptj_residual, + const vector weights): + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + layer_num_(layer_num), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + start_id_(start_id), + end_id_(end_id), + use_gptj_residual_(use_gptj_residual), + weights_(weights), + tensor_para_size_(tensor_para_size), + pipeline_para_size_(pipeline_para_size) + { + ft::check_cuda_error(cublasLtCreate(&cublasltHandle_)); + cublas_algo_map_ = new ft::cublasAlgoMap(GEMM_CONFIG, ""); + cublas_wrapper_mutex_ = new std::mutex(); + + ftNcclInitialize(tensor_para_, pipeline_para_, tensor_para_size, pipeline_para_size); + + gpt_weights_.resizeLayer(layer_num_); + for (int i = 0; i < (int)layer_num_; i++) { + gpt_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta = + get_ptr(weights_[i + 0 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->pre_layernorm_weights.gamma = + get_ptr(weights_[i + 1 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.kernel = + get_ptr(weights_[i + 2 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.bias = + get_ptr(weights_[i + 3 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.kernel = + get_ptr(weights_[i + 4 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.bias = + get_ptr(weights_[i + 5 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.kernel = + get_ptr(weights_[i + 6 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.bias = + get_ptr(weights_[i + 7 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.kernel = + get_ptr(weights_[i + 8 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.bias = + get_ptr(weights_[i + 9 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.beta = + get_ptr(weights_[i + 10 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.gamma = + get_ptr(weights_[i + 11 * layer_num_]); + } + + gpt_weights_.pre_decoder_embedding_table 
= get_ptr(weights_[12 * layer_num_ + 0]); + gpt_weights_.post_decoder_layernorm.gamma = get_ptr(weights_[12 * layer_num_ + 1]); + gpt_weights_.post_decoder_layernorm.beta = get_ptr(weights_[12 * layer_num_ + 2]); + gpt_weights_.post_decoder_embedding.kernel = get_ptr(weights_[12 * layer_num_ + 3]); + + gpt_weights_.setMaxSeqLen(max_seq_len); + + ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); + } + + ~FTLLaMA() override + { + ft::ftNcclParamDestroy(tensor_para_); + ft::ftNcclParamDestroy(pipeline_para_); + cublasLtDestroy(cublasltHandle_); + delete cublas_algo_map_; + delete cublas_wrapper_mutex_; + } + + void forward(th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& output_ids, + th::Tensor& sequence_lengths, + th::Tensor& cum_log_probs, + const size_t request_output_len, + const size_t beam_width, + th::optional top_k_opt, + th::optional top_p_opt, + th::optional beam_search_diversity_rate_opt, + th::optional temperature_opt, + th::optional len_penalty_opt, + th::optional repetition_penalty_opt, + th::optional random_seed_opt, + th::optional return_cum_log_probs_opt) override + { + int return_cum_log_probs = return_cum_log_probs_opt.has_value() ? (int)return_cum_log_probs_opt.value() : 0; + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); + cublasSetStream(cublasHandle, stream); + ft::Allocator allocator = ft::Allocator(); + ft::cublasMMWrapper cublas_wrapper = ft::cublasMMWrapper( + cublasHandle, cublasltHandle_, stream, cublas_algo_map_, cublas_wrapper_mutex_, &allocator); + + if (std::is_same::value) { + cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } + else if (std::is_same::value) { + cublas_wrapper.setFP32GemmConfig(); + } + + const size_t request_batch_size = (size_t)input_ids.size(0); + const size_t max_input_length = (size_t)input_ids.size(1); + const int total_output_len = (int)(max_input_length + request_output_len); + + ft::AttentionType attention_type = ft::getAttentionType(size_per_head_, + ft::getSMVersion(), + true, // remove_padding + 0, // gpt supports any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + + ft::LLaMA gpt = ft::LLaMA(head_num_, + size_per_head_, + inter_size_, + layer_num_, + vocab_size_, + rotary_embedding_dim_, + start_id_, + end_id_, + end_id_ + 1, // p/prompt tuning virtual token start id + ft::PromptLearningType::no_prompt, + use_gptj_residual_, + 0.0f, // beam_search_diversity_rate, + 1, // top_k, + 0.0, // top_p, + 0, // random_seed, + 1.0f, // temperature, + 1.0f, // len_penalty, + 1.0f, // repetition_penalty, + tensor_para_, + pipeline_para_, + stream, + &cublas_wrapper, + &allocator, + false, // is_free_buffer_after_forward + &prop_, // cuda_device_prop + attention_type, // attention_type + nullptr, // custom_all_reduce_comm + 0); // enable_custom_all_reduce + + std::vector output_seq_len(request_batch_size, total_output_len); + + std::unordered_map input_tensors = std::unordered_map{ + {"input_ids", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{request_batch_size, max_input_length}, + get_ptr(input_ids)}}, + {"input_lengths", + ft::Tensor{ + ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{request_batch_size}, get_ptr(input_lengths)}}, + {"output_seq_len", + ft::Tensor{ + ft::MEMORY_CPU, ft::TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}}; + if (beam_width > 1 && beam_search_diversity_rate_opt.has_value()) { + 
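+            // Note (editorial comment, not in the original patch): this and the optional sampling parameters
+            // below are only inserted into input_tensors when the caller supplied a value;
+            // beam_search_diversity_rate additionally requires beam search to be active (beam_width > 1).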
input_tensors.insert( + {"beam_search_diversity_rate", + convert_tensor(beam_search_diversity_rate_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (top_p_opt.has_value()) { + input_tensors.insert( + {"runtime_top_p", convert_tensor(top_p_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (top_k_opt.has_value()) { + input_tensors.insert( + {"runtime_top_k", convert_tensor(top_k_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (temperature_opt.has_value()) { + input_tensors.insert( + {"temperature", convert_tensor(temperature_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (len_penalty_opt.has_value()) { + input_tensors.insert( + {"len_penalty", convert_tensor(len_penalty_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (repetition_penalty_opt.has_value()) { + input_tensors.insert({"repetition_penalty", + convert_tensor(repetition_penalty_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (random_seed_opt.has_value()) { + input_tensors.insert( + {"random_seed", + convert_tensor(random_seed_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + + std::unordered_map output_tensors = std::unordered_map{ + {"output_ids", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{request_batch_size, beam_width, (size_t)total_output_len}, + get_ptr(output_ids)}}, + {"sequence_length", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{request_batch_size, beam_width}, + get_ptr(sequence_lengths)}}}; + + if (return_cum_log_probs > 0) { + output_tensors.insert({"cum_log_probs", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_FP32, + std::vector{request_batch_size, beam_width}, + get_ptr(cum_log_probs)}}); + } + + try { + gpt.forward(&output_tensors, &input_tensors, &gpt_weights_); + } + catch (std::runtime_error& error) { + std::cout << error.what(); + exit(-1); + } + catch (...) 
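+        // Note (editorial comment, not in the original patch): catch-all for any non-std exception
+        // escaping the forward pass; it is reported and treated as fatal.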
{ + std::cout << "Runtime error"; + exit(-1); + } + } + +private: + const size_t head_num_; + const size_t size_per_head_; + const size_t inter_size_; + const size_t layer_num_; + const size_t vocab_size_; + const size_t rotary_embedding_dim_; + const int start_id_; + const int end_id_; + const bool use_gptj_residual_; + + // const ft::gptVariantParams gpt_variant_params_; + + std::vector weights_; + cublasLtHandle_t cublasltHandle_; + std::mutex* cublas_wrapper_mutex_; + ft::cublasAlgoMap* cublas_algo_map_; + struct cudaDeviceProp prop_; + ft::LLaMAWeight gpt_weights_; + + ft::NcclParam tensor_para_; + ft::NcclParam pipeline_para_; + + int64_t tensor_para_size_; + int64_t pipeline_para_size_; +}; + +class LLaMA: public th::jit::CustomClassHolder { +public: + LLaMA(const int64_t head_num, + const int64_t size_per_head, + const int64_t inter_size, + const int64_t layer_num, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t start_id, + const int64_t end_id, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const int64_t max_seq_len, + const bool use_gptj_residual, + const vector weights); + + ~LLaMA(); + + vector forward(th::Tensor input_ids, + th::Tensor input_lengths, + const int64_t output_len, + th::optional beam_width_opt, + th::optional top_k_opt, + th::optional top_p_opt, + th::optional beam_search_diversity_rate_opt, + th::optional temperature_opt, + th::optional len_penalty_opt, + th::optional repetition_penalty_opt, + th::optional random_seed_opt, + th::optional return_cum_log_probs_opt); + +private: + const at::ScalarType st_; + IFLLaMA* ftgpt; + std::vector weights; +}; + +} // namespace torch_ext From a0276fbc0a08c409a0663042ba91516c8d1cf1ca Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 12 Sep 2023 00:24:21 +0000 Subject: [PATCH 03/55] rename varaible --- src/fastertransformer/models/llama/LLaMA.cc | 128 +++++++++--------- src/fastertransformer/models/llama/LLaMA.h | 24 ++-- .../models/llama/LLaMAContextDecoder.cc | 60 ++++---- .../models/llama/LLaMAContextDecoder.h | 14 +- .../models/llama/LLaMADecoder.cc | 58 ++++---- .../models/llama/LLaMADecoder.h | 14 +- .../models/llama/LLaMADecoderLayerWeight.cc | 20 +-- .../models/llama/LLaMADecoderLayerWeight.h | 12 +- .../models/llama/LLaMAWeight.cc | 32 ++--- .../models/llama/LLaMAWeight.h | 18 +-- 10 files changed, 190 insertions(+), 190 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 2ce2dae7b..4ac26a473 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "src/fastertransformer/models/gptneox/GptNeoX.h" +#include "src/fastertransformer/models/llama/LLaMA.h" #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/decoding_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" @@ -24,9 +24,9 @@ namespace fastertransformer { template -void GptNeoX::initialize() +void LLaMA::initialize() { - gpt_context_decoder_ = new GptNeoXContextDecoder(head_num_, + llama_context_decoder_ = new LLaMAContextDecoder(head_num_, size_per_head_, inter_size_, num_layer_, @@ -45,7 +45,7 @@ void GptNeoX::initialize() custom_all_reduce_comm_, enable_custom_all_reduce_); - gpt_decoder_ = new GptNeoXDecoder(head_num_, + llama_decoder_ = new LLaMADecoder(head_num_, size_per_head_, inter_size_, num_layer_, @@ -73,13 +73,13 @@ void GptNeoX::initialize() } template -void GptNeoX::allocateBuffer() +void LLaMA::allocateBuffer() { FT_CHECK(false); } template -void GptNeoX::allocateBuffer( +void LLaMA::allocateBuffer( size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); @@ -154,7 +154,7 @@ void GptNeoX::allocateBuffer( } template -void GptNeoX::freeBuffer() +void LLaMA::freeBuffer() { if (is_allocate_buffer_) { if (vocab_size_ != vocab_size_padded_) { @@ -206,7 +206,7 @@ void GptNeoX::freeBuffer() } template -GptNeoX::GptNeoX(size_t head_num, +LLaMA::LLaMA(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -262,7 +262,7 @@ GptNeoX::GptNeoX(size_t head_num, } template -GptNeoX::GptNeoX(size_t head_num, +LLaMA::LLaMA(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -319,66 +319,66 @@ GptNeoX::GptNeoX(size_t head_num, } template -GptNeoX::GptNeoX(GptNeoX const& gpt): - BaseLayer(gpt), - head_num_(gpt.head_num_), - size_per_head_(gpt.size_per_head_), - inter_size_(gpt.inter_size_), - num_layer_(gpt.num_layer_), - vocab_size_(gpt.vocab_size_), - rotary_embedding_dim_(gpt.rotary_embedding_dim_), - start_id_(gpt.start_id_), - end_id_(gpt.end_id_), - prompt_learning_start_id_(gpt.prompt_learning_start_id_), - prompt_learning_type_(gpt.prompt_learning_type_), - use_gptj_residual_(gpt.use_gptj_residual_), - hidden_units_(gpt.hidden_units_), - tensor_para_(gpt.tensor_para_), - pipeline_para_(gpt.pipeline_para_), - local_head_num_(gpt.local_head_num_), - vocab_size_padded_(gpt.vocab_size_padded_), - custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), - enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), - attention_type_(gpt.attention_type_) +LLaMA::LLaMA(LLaMA const& llama): + BaseLayer(llama), + head_num_(llama.head_num_), + size_per_head_(llama.size_per_head_), + inter_size_(llama.inter_size_), + num_layer_(llama.num_layer_), + vocab_size_(llama.vocab_size_), + rotary_embedding_dim_(llama.rotary_embedding_dim_), + start_id_(llama.start_id_), + end_id_(llama.end_id_), + prompt_learning_start_id_(llama.prompt_learning_start_id_), + prompt_learning_type_(llama.prompt_learning_type_), + use_gptj_residual_(llama.use_gptj_residual_), + hidden_units_(llama.hidden_units_), + tensor_para_(llama.tensor_para_), + pipeline_para_(llama.pipeline_para_), + local_head_num_(llama.local_head_num_), + vocab_size_padded_(llama.vocab_size_padded_), + custom_all_reduce_comm_(llama.custom_all_reduce_comm_), + enable_custom_all_reduce_(llama.enable_custom_all_reduce_), + attention_type_(llama.attention_type_) { initialize(); } template -GptNeoX::~GptNeoX() +LLaMA::~LLaMA() { - 
delete gpt_decoder_; + delete llama_decoder_; delete dynamic_decode_layer_; - delete gpt_context_decoder_; + delete llama_context_decoder_; freeBuffer(); } template -void GptNeoX::registerCallback(callback_sig* fn, void* ctx) +void LLaMA::registerCallback(callback_sig* fn, void* ctx) { token_generated_cb_ = fn; token_generated_ctx_ = ctx; } template -void GptNeoX::unRegisterCallback() +void LLaMA::unRegisterCallback() { token_generated_cb_ = nullptr; token_generated_ctx_ = nullptr; } template -void GptNeoX::forward(std::vector* output_tensors, +void LLaMA::forward(std::vector* output_tensors, const std::vector* input_tensors, - const GptNeoXWeight* gpt_weights) + const LLaMAWeight* llama_weights) { FT_CHECK(false); } template -void GptNeoX::forward(std::unordered_map* output_tensors, +void LLaMA::forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const GptNeoXWeight* gpt_weights) + const LLaMAWeight* llama_weights) { // input_tensors: // input_ids [batch_size, max_input_length] @@ -478,7 +478,7 @@ void GptNeoX::forward(std::unordered_map* output_t // throw errors when prompt task_name_ids are not found std::pair prefix_prompt_weight_length_pair; try { - prefix_prompt_weight_length_pair = gpt_weights->prompt_learning_table.at(task_id); + prefix_prompt_weight_length_pair = llama_weights->prompt_learning_table.at(task_id); } catch (const std::out_of_range& oor) { FT_LOG_ERROR("prefix_prompt_weights_lengths not found for prompt task id: " + task_id); @@ -594,8 +594,8 @@ void GptNeoX::forward(std::unordered_map* output_t param.from_tensor = context_decoder_input_buf_; param.output_ids = output_ids_buf_; param.input_lengths = tiled_input_lengths_buf_; - param.embedding_table = gpt_weights->pre_decoder_embedding_table; - param.pos_table = gpt_weights->position_encoding_table; + param.embedding_table = llama_weights->pre_decoder_embedding_table; + param.pos_table = llama_weights->position_encoding_table; param.prefix_soft_prompt_embedding = input_tensors->at("request_prompt_embedding").getPtr(); param.prefix_soft_prompt_lengths = input_tensors->at("request_prompt_lengths").getPtr(); param.input_ids = tiled_input_ids_buf_; @@ -614,8 +614,8 @@ void GptNeoX::forward(std::unordered_map* output_t else { invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, output_ids_buf_, - gpt_weights->pre_decoder_embedding_table, - gpt_weights->position_encoding_table, + llama_weights->pre_decoder_embedding_table, + llama_weights->position_encoding_table, pPromptTuningParam{}, // no p/prompt tuning tiled_input_ids_buf_, 1, @@ -673,8 +673,8 @@ void GptNeoX::forward(std::unordered_map* output_t {"last_token_hidden_units", Tensor{MEMORY_GPU, data_type, {batch_size * beam_width, hidden_units_}, decoder_output_buf_}}}; - gpt_context_decoder_->forward( - &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); + llama_context_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); invokeDecodingInitialize(finished_buf_, sequence_lengths_, @@ -739,16 +739,16 @@ void GptNeoX::forward(std::unordered_map* output_t } if (vocab_size_ == vocab_size_padded_) { - padded_embedding_kernel_ptr_ = gpt_weights->post_decoder_embedding.kernel; + padded_embedding_kernel_ptr_ = llama_weights->post_decoder_embedding.kernel; } else { cudaMemcpyAsync(padded_embedding_kernel_, - gpt_weights->post_decoder_embedding.kernel, + llama_weights->post_decoder_embedding.kernel, sizeof(T) * 
vocab_size_ * hidden_units_, cudaMemcpyDeviceToDevice, stream_); cudaMemcpyAsync(padded_embedding_bias_, - gpt_weights->post_decoder_embedding.bias, + llama_weights->post_decoder_embedding.bias, sizeof(T) * vocab_size_, cudaMemcpyDeviceToDevice, stream_); @@ -782,8 +782,8 @@ void GptNeoX::forward(std::unordered_map* output_t if (!(max_input_length > 1 && step == max_input_length)) { if (pipeline_para_.rank_ == 0) { invokeEmbeddingLookupPosEncodingPadCount(decoder_input_buf_ + hidden_units_offset, - gpt_weights->pre_decoder_embedding_table, - gpt_weights->position_encoding_table, + llama_weights->pre_decoder_embedding_table, + llama_weights->position_encoding_table, output_ids_buf_ + id_offset, tiled_total_padding_count_ + id_offset, local_batch_size * beam_width, @@ -838,15 +838,15 @@ void GptNeoX::forward(std::unordered_map* output_t decoder_output_buf_ + hidden_units_offset}}, {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}}; - gpt_decoder_->forward( - &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); + llama_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); } if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { invokeGeneralLayerNorm(normed_decoder_output_buf_ + hidden_units_offset, decoder_output_buf_ + hidden_units_offset, - gpt_weights->post_decoder_layernorm.gamma, - gpt_weights->post_decoder_layernorm.beta, + llama_weights->post_decoder_layernorm.gamma, + llama_weights->post_decoder_layernorm.beta, layernorm_eps_, local_batch_size * beam_width, hidden_units_, @@ -1045,7 +1045,7 @@ void GptNeoX::forward(std::unordered_map* output_t } template -void GptNeoX::sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, +void LLaMA::sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); @@ -1080,7 +1080,7 @@ void GptNeoX::sendTensorsToFirstPipelineNode(std::unordered_map -void GptNeoX::setOutputTensors(std::unordered_map* output_tensors, +void LLaMA::setOutputTensors(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, const size_t max_input_length, const size_t max_output_seq_len) @@ -1176,36 +1176,36 @@ void GptNeoX::setOutputTensors(std::unordered_map* } template -size_t GptNeoX::getPipelineParallelRank() +size_t LLaMA::getPipelineParallelRank() { return pipeline_para_.rank_; } template -size_t GptNeoX::getPipelineParallelSize() +size_t LLaMA::getPipelineParallelSize() { return pipeline_para_.world_size_; } template -size_t GptNeoX::getTensorParallelRank() +size_t LLaMA::getTensorParallelRank() { return tensor_para_.rank_; } template -size_t GptNeoX::getTensorParallelSize() +size_t LLaMA::getTensorParallelSize() { return tensor_para_.world_size_; } template -bool* GptNeoX::getFinishBuffer() +bool* LLaMA::getFinishBuffer() { return finished_buf_; } -template class GptNeoX; -template class GptNeoX; +template class LLaMA; +template class LLaMA; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 9749a2070..5cf7b0025 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -20,16 +20,16 @@ #include #include "src/fastertransformer/layers/DynamicDecodeLayer.h" -#include 
"src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h" -#include "src/fastertransformer/models/gptneox/GptNeoXDecoder.h" -#include "src/fastertransformer/models/gptneox/GptNeoXWeight.h" +#include "src/fastertransformer/models/llama/LLaMAContextDecoder.h" +#include "src/fastertransformer/models/llama/LLaMADecoder.h" +#include "src/fastertransformer/models/llama/LLaMAWeight.h" #include "src/fastertransformer/utils/custom_ar_comm.h" #include "src/fastertransformer/utils/prompt_learning.h" namespace fastertransformer { template -class GptNeoX: public BaseLayer { +class LLaMA: public BaseLayer { private: // meta data size_t head_num_; @@ -69,8 +69,8 @@ class GptNeoX: public BaseLayer { bool has_prefix_prompt_; bool has_prefix_soft_prompt_; - GptNeoXDecoder* gpt_decoder_; - GptNeoXContextDecoder* gpt_context_decoder_; + LLaMADecoder* llama_decoder_; + LLaMAContextDecoder* llama_context_decoder_; DynamicDecodeLayer* dynamic_decode_layer_; void allocateBuffer() override; @@ -137,7 +137,7 @@ class GptNeoX: public BaseLayer { const std::unordered_map* input_tensors); public: - GptNeoX(size_t head_num, + LLaMA(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -164,7 +164,7 @@ class GptNeoX: public BaseLayer { std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce = 0); - GptNeoX(size_t head_num, + LLaMA(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -193,17 +193,17 @@ class GptNeoX: public BaseLayer { std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce = 0); - GptNeoX(GptNeoX const& GptNeoX); + LLaMA(LLaMA const& LLaMA); - ~GptNeoX(); + ~LLaMA(); void forward(std::vector* output_tensors, const std::vector* input_tensors, - const GptNeoXWeight* gpt_weights); + const LLaMAWeight* llama_weights); void forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const GptNeoXWeight* gpt_weights); + const LLaMAWeight* llama_weights); size_t getPipelineParallelRank(); size_t getPipelineParallelSize(); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index f23d1a977..69ed839a3 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h" +#include "src/fastertransformer/models/llama/LLaMAContextDecoder.h" #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" @@ -24,7 +24,7 @@ namespace fastertransformer { template -void GptNeoXContextDecoder::initialize() +void LLaMAContextDecoder::initialize() { self_attention_layer_ = new TensorParallelGptContextAttentionLayer(0, // max_batch_size 0, // max_seq_len @@ -64,13 +64,13 @@ void GptNeoXContextDecoder::initialize() } template -void GptNeoXContextDecoder::allocateBuffer() +void LLaMAContextDecoder::allocateBuffer() { FT_CHECK(false); } template -void GptNeoXContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -88,7 +88,7 @@ void GptNeoXContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) } template -void GptNeoXContextDecoder::freeBuffer() +void LLaMAContextDecoder::freeBuffer() { if (is_allocate_buffer_ == true) { allocator_->free((void**)(&decoder_normed_input_)); @@ -103,7 +103,7 @@ void GptNeoXContextDecoder::freeBuffer() } template -bool GptNeoXContextDecoder::isValidLayerParallelId(uint l) +bool LLaMAContextDecoder::isValidLayerParallelId(uint l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) @@ -111,28 +111,28 @@ bool GptNeoXContextDecoder::isValidLayerParallelId(uint l) } template -bool GptNeoXContextDecoder::isFirstLayerParallelId(uint l) +bool LLaMAContextDecoder::isFirstLayerParallelId(uint l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); } template -bool GptNeoXContextDecoder::isLastLayerParallelId(uint l) +bool LLaMAContextDecoder::isLastLayerParallelId(uint l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return l < num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); } template -int GptNeoXContextDecoder::getFirstLayerParallelId() +int LLaMAContextDecoder::getFirstLayerParallelId() { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return local_num_layer * pipeline_para_.rank_; } template -GptNeoXContextDecoder::GptNeoXContextDecoder(size_t head_num, +LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -171,7 +171,7 @@ GptNeoXContextDecoder::GptNeoXContextDecoder(size_t } template -GptNeoXContextDecoder::GptNeoXContextDecoder(GptNeoXContextDecoder const& decoder): +LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decoder): BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), head_num_(decoder.head_num_), size_per_head_(decoder.size_per_head_), @@ -193,7 +193,7 @@ GptNeoXContextDecoder::GptNeoXContextDecoder(GptNeoXContextDecoder const& } template -GptNeoXContextDecoder::~GptNeoXContextDecoder() +LLaMAContextDecoder::~LLaMAContextDecoder() { delete self_attention_layer_; delete ffn_layer_; @@ -201,9 +201,9 @@ GptNeoXContextDecoder::~GptNeoXContextDecoder() } template -void GptNeoXContextDecoder::forward(std::vector* 
output_tensors, +void LLaMAContextDecoder::forward(std::vector* output_tensors, const std::vector* input_tensors, - const std::vector*>* gpt_decoder_layer_weight) + const std::vector*>* llama_decoder_layer_weight) { std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, {"attention_mask", input_tensors->at(1)}, @@ -213,13 +213,13 @@ void GptNeoXContextDecoder::forward(std::vector* {"value_cache", output_tensors->at(2)}, {"last_token_hidden_units", output_tensors->at(3)}}; - forward(&output_tensors_map, &input_tensors_map, gpt_decoder_layer_weight); + forward(&output_tensors_map, &input_tensors_map, llama_decoder_layer_weight); } template -void GptNeoXContextDecoder::forward(std::unordered_map* output_tensors, +void LLaMAContextDecoder::forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const std::vector*>* gpt_decoder_layer_weight) + const std::vector*>* llama_decoder_layer_weight) { // input tensors: // decoder_input [batch_size, seq_len, hidden_dimension], @@ -332,8 +332,8 @@ void GptNeoXContextDecoder::forward(std::unordered_map* invokeGeneralLayerNorm(decoder_normed_input_, layer_input, - gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, layernorm_eps_, h_token_num, hidden_units_, @@ -393,14 +393,14 @@ void GptNeoXContextDecoder::forward(std::unordered_map* self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, - &gpt_decoder_layer_weight->at(l)->self_attention_weights); + &llama_decoder_layer_weight->at(l)->self_attention_weights); if (is_final == false) { if (use_gptj_residual_) { invokeGeneralLayerNorm(decoder_normed_input_, layer_input, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, layernorm_eps_, h_token_num, hidden_units_, @@ -414,9 +414,9 @@ void GptNeoXContextDecoder::forward(std::unordered_map* decoder_normed_input_, self_attn_output_, layer_input, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - gpt_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, layernorm_eps_, h_token_num, hidden_units_, @@ -437,7 +437,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* {h_token_num, (size_t)hidden_units_}, use_gptj_residual_ ? 
ffn_output_ : layer_output}}}); ffn_layer_->forward( - &ffn_output_tensors, &ffn_input_tensors, &gpt_decoder_layer_weight->at(l)->ffn_weights); + &ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); if (use_gptj_residual_) { // Original workflow: @@ -451,7 +451,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* ffn_output_, self_attn_output_, layer_input, - gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, h_token_num, hidden_units_, tensor_para_.world_size_, @@ -464,7 +464,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* else { invokeAddBiasResidual(layer_output, self_attn_output_, - gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, h_token_num, hidden_units_, stream_); @@ -508,7 +508,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* } } -template class GptNeoXContextDecoder; -template class GptNeoXContextDecoder; +template class LLaMAContextDecoder; +template class LLaMAContextDecoder; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index c81dcfe90..b84285f14 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -23,7 +23,7 @@ #include "src/fastertransformer/layers/BaseLayer.h" #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" -#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" #include "src/fastertransformer/utils/Tensor.h" #include "src/fastertransformer/utils/allocator.h" #include "src/fastertransformer/utils/cublasMMWrapper.h" @@ -33,7 +33,7 @@ namespace fastertransformer { template -class GptNeoXContextDecoder: public BaseLayer { +class LLaMAContextDecoder: public BaseLayer { private: // meta data size_t head_num_; @@ -82,7 +82,7 @@ class GptNeoXContextDecoder: public BaseLayer { int* cu_seqlens_ = nullptr; public: - GptNeoXContextDecoder(size_t head_num, + LLaMAContextDecoder(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -101,17 +101,17 @@ class GptNeoXContextDecoder: public BaseLayer { std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); - GptNeoXContextDecoder(GptNeoXContextDecoder const& decoder); + LLaMAContextDecoder(LLaMAContextDecoder const& decoder); - ~GptNeoXContextDecoder(); + ~LLaMAContextDecoder(); void forward(std::vector* output_tensors, const std::vector* input_tensors, - const std::vector*>* decoder_layer_weights); + const std::vector*>* decoder_layer_weights); void forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const std::vector*>* gpt_decoder_layer_weight); + const std::vector*>* llama_decoder_layer_weight); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoder.cc b/src/fastertransformer/models/llama/LLaMADecoder.cc index 7b73ba8ee..3a8fc1458 100644 --- a/src/fastertransformer/models/llama/LLaMADecoder.cc +++ b/src/fastertransformer/models/llama/LLaMADecoder.cc @@ -14,14 +14,14 @@ * limitations under the License. 
*/ -#include "src/fastertransformer/models/gptneox/GptNeoXDecoder.h" +#include "src/fastertransformer/models/llama/LLaMADecoder.h" #include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" #include "src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.h" namespace fastertransformer { template -void GptNeoXDecoder::initialize() +void LLaMADecoder::initialize() { self_attention_layer_ = new TensorParallelDecoderSelfAttentionLayer(0, // max_batch_size head_num_, @@ -59,13 +59,13 @@ void GptNeoXDecoder::initialize() } template -void GptNeoXDecoder::allocateBuffer() +void LLaMADecoder::allocateBuffer() { FT_CHECK(false); } template -void GptNeoXDecoder::allocateBuffer(size_t batch_size) +void LLaMADecoder::allocateBuffer(size_t batch_size) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * hidden_units_, false)); @@ -79,7 +79,7 @@ void GptNeoXDecoder::allocateBuffer(size_t batch_size) } template -void GptNeoXDecoder::freeBuffer() +void LLaMADecoder::freeBuffer() { if (is_allocate_buffer_ == true) { allocator_->free((void**)(&decoder_normed_input_)); @@ -91,7 +91,7 @@ void GptNeoXDecoder::freeBuffer() } template -bool GptNeoXDecoder::isValidLayerParallelId(uint l) +bool LLaMADecoder::isValidLayerParallelId(uint l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) @@ -99,28 +99,28 @@ bool GptNeoXDecoder::isValidLayerParallelId(uint l) } template -bool GptNeoXDecoder::isFirstLayerParallelId(uint l) +bool LLaMADecoder::isFirstLayerParallelId(uint l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); } template -bool GptNeoXDecoder::isLastLayerParallelId(uint l) +bool LLaMADecoder::isLastLayerParallelId(uint l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return l < num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); } template -int GptNeoXDecoder::getFirstLayerParallelId() +int LLaMADecoder::getFirstLayerParallelId() { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return local_num_layer * pipeline_para_.rank_; } template -GptNeoXDecoder::GptNeoXDecoder(size_t head_num, +LLaMADecoder::LLaMADecoder(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -155,7 +155,7 @@ GptNeoXDecoder::GptNeoXDecoder(size_t head_num, } template -GptNeoXDecoder::GptNeoXDecoder(GptNeoXDecoder const& decoder): +LLaMADecoder::LLaMADecoder(LLaMADecoder const& decoder): BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), head_num_(decoder.head_num_), size_per_head_(decoder.size_per_head_), @@ -175,7 +175,7 @@ GptNeoXDecoder::GptNeoXDecoder(GptNeoXDecoder const& decoder): } template -GptNeoXDecoder::~GptNeoXDecoder() +LLaMADecoder::~LLaMADecoder() { delete self_attention_layer_; delete ffn_layer_; @@ -183,17 +183,17 @@ GptNeoXDecoder::~GptNeoXDecoder() } template -void GptNeoXDecoder::forward(std::vector* output_tensors, +void LLaMADecoder::forward(std::vector* output_tensors, const std::vector* input_tensors, - const std::vector*>* gpt_decoder_layer_weight) + const std::vector*>* llama_decoder_layer_weight) { FT_CHECK(false); } template -void GptNeoXDecoder::forward(std::unordered_map* output_tensors, +void 
LLaMADecoder::forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const std::vector*>* gpt_decoder_layer_weight) + const std::vector*>* llama_decoder_layer_weight) { // input tensors: // decoder_input [local_batch_size, hidden_dimension], @@ -263,8 +263,8 @@ void GptNeoXDecoder::forward(std::unordered_map* invokeGeneralLayerNorm(decoder_normed_input_, layer_input, - gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, layernorm_eps_, local_batch_size, hidden_units_, @@ -294,12 +294,12 @@ void GptNeoXDecoder::forward(std::unordered_map* self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, - &gpt_decoder_layer_weight->at(l)->self_attention_weights); + &llama_decoder_layer_weight->at(l)->self_attention_weights); if (use_gptj_residual_) { invokeGeneralLayerNorm(decoder_normed_input_, layer_input, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, layernorm_eps_, local_batch_size, hidden_units_, @@ -313,9 +313,9 @@ void GptNeoXDecoder::forward(std::unordered_map* decoder_normed_input_, self_attn_output_, layer_input, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - gpt_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, layernorm_eps_, local_batch_size, hidden_units_, @@ -334,7 +334,7 @@ void GptNeoXDecoder::forward(std::unordered_map* data_type, {local_batch_size, hidden_units_}, use_gptj_residual_ ? 
ffn_output_ : layer_output}}}); - ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &gpt_decoder_layer_weight->at(l)->ffn_weights); + ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); if (use_gptj_residual_) { // Original workflow: @@ -346,7 +346,7 @@ void GptNeoXDecoder::forward(std::unordered_map* ffn_output_, self_attn_output_, layer_input, - gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, local_batch_size, hidden_units_, tensor_para_.world_size_, @@ -358,7 +358,7 @@ void GptNeoXDecoder::forward(std::unordered_map* else { invokeAddBiasResidual(layer_output, self_attn_output_, - gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, local_batch_size, hidden_units_, stream_); @@ -385,7 +385,7 @@ void GptNeoXDecoder::forward(std::unordered_map* } } -template class GptNeoXDecoder; -template class GptNeoXDecoder; +template class LLaMADecoder; +template class LLaMADecoder; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoder.h b/src/fastertransformer/models/llama/LLaMADecoder.h index add736adc..cbbc272ff 100644 --- a/src/fastertransformer/models/llama/LLaMADecoder.h +++ b/src/fastertransformer/models/llama/LLaMADecoder.h @@ -23,7 +23,7 @@ #include "src/fastertransformer/layers/BaseLayer.h" #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" -#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" #include "src/fastertransformer/utils/Tensor.h" #include "src/fastertransformer/utils/allocator.h" #include "src/fastertransformer/utils/cublasMMWrapper.h" @@ -33,7 +33,7 @@ namespace fastertransformer { template -class GptNeoXDecoder: public BaseLayer { +class LLaMADecoder: public BaseLayer { private: protected: void allocateBuffer() override; @@ -71,7 +71,7 @@ class GptNeoXDecoder: public BaseLayer { FfnLayer* ffn_layer_; public: - GptNeoXDecoder(size_t head_num, + LLaMADecoder(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -88,17 +88,17 @@ class GptNeoXDecoder: public BaseLayer { std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); - GptNeoXDecoder(GptNeoXDecoder const& decoder); + LLaMADecoder(LLaMADecoder const& decoder); - virtual ~GptNeoXDecoder(); + virtual ~LLaMADecoder(); virtual void forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const std::vector*>* decoder_layer_weights); + const std::vector*>* decoder_layer_weights); virtual void forward(std::vector* output_tensors, const std::vector* input_tensors, - const std::vector*>* decoder_layer_weights); + const std::vector*>* decoder_layer_weights); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc index 3d62df83d..9ed355047 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc @@ -14,13 +14,13 @@ * limitations under the License. 
*/ -#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" #include "src/fastertransformer/utils/memory_utils.h" namespace fastertransformer { template -GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const int hidden_units, +LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size, const int tensor_para_rank, @@ -36,7 +36,7 @@ GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const int hidden_units, } template -GptNeoXDecoderLayerWeight::~GptNeoXDecoderLayerWeight() +LLaMADecoderLayerWeight::~LLaMADecoderLayerWeight() { if (is_maintain_buffer == true) { for (int i = 0; i < 12; i++) { @@ -63,7 +63,7 @@ GptNeoXDecoderLayerWeight::~GptNeoXDecoderLayerWeight() } template -GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other): +LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other): hidden_units_(other.hidden_units_), inter_size_(other.inter_size_), tensor_para_size_(other.tensor_para_size_), @@ -90,7 +90,7 @@ GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const GptNeoXDecoderLaye } template -GptNeoXDecoderLayerWeight& GptNeoXDecoderLayerWeight::operator=(const GptNeoXDecoderLayerWeight& other) +LLaMADecoderLayerWeight& LLaMADecoderLayerWeight::operator=(const LLaMADecoderLayerWeight& other) { hidden_units_ = other.hidden_units_; inter_size_ = other.inter_size_; @@ -119,7 +119,7 @@ GptNeoXDecoderLayerWeight& GptNeoXDecoderLayerWeight::operator=(const GptN } template -void GptNeoXDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) +void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) { FT_CHECK(is_maintain_buffer == true); const std::string rank_spec = std::to_string(tensor_para_rank_); @@ -175,7 +175,7 @@ void GptNeoXDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataTyp } template -void GptNeoXDecoderLayerWeight::setWeightPtr() +void LLaMADecoderLayerWeight::setWeightPtr() { pre_layernorm_weights.beta = weights_ptr[0]; pre_layernorm_weights.gamma = weights_ptr[1]; @@ -195,7 +195,7 @@ void GptNeoXDecoderLayerWeight::setWeightPtr() } template -void GptNeoXDecoderLayerWeight::mallocWeights() +void LLaMADecoderLayerWeight::mallocWeights() { deviceMalloc(&weights_ptr[0], hidden_units_); deviceMalloc(&weights_ptr[1], hidden_units_); @@ -214,7 +214,7 @@ void GptNeoXDecoderLayerWeight::mallocWeights() deviceMalloc(&weights_ptr[11], hidden_units_); } -template struct GptNeoXDecoderLayerWeight; -template struct GptNeoXDecoderLayerWeight; +template struct LLaMADecoderLayerWeight; +template struct LLaMADecoderLayerWeight; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h index 2850da466..44726f58c 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h @@ -26,17 +26,17 @@ namespace fastertransformer { template -struct GptNeoXDecoderLayerWeight { +struct LLaMADecoderLayerWeight { public: - GptNeoXDecoderLayerWeight() = default; - GptNeoXDecoderLayerWeight(const int hidden_units, + LLaMADecoderLayerWeight() = default; + LLaMADecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, const int tensor_para_rank = 0, const bool use_gptj_residual = 
true); - ~GptNeoXDecoderLayerWeight(); - GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other); - GptNeoXDecoderLayerWeight& operator=(const GptNeoXDecoderLayerWeight& other); + ~LLaMADecoderLayerWeight(); + LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other); + LLaMADecoderLayerWeight& operator=(const LLaMADecoderLayerWeight& other); void loadModel(std::string dir_path, FtCudaDataType model_file_type); diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc index 26995f255..cc8c5ab25 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.cc +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -14,12 +14,12 @@ * limitations under the License. */ -#include "src/fastertransformer/models/gptneox/GptNeoXWeight.h" +#include "src/fastertransformer/models/llama/LLaMAWeight.h" namespace fastertransformer { template -GptNeoXWeight::GptNeoXWeight(const int hidden_units, +LLaMAWeight::LLaMAWeight(const int hidden_units, const int inter_size, const int vocab_size, const int num_layer, @@ -61,13 +61,13 @@ GptNeoXWeight::GptNeoXWeight(const int hidde decoder_layer_weights.reserve(num_layer_); for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { - decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight( + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight( hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); } else { // Layer-parallelism: allocate empty layer because // this rank does not compute it: - decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight(0, 0)); + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight(0, 0)); } } @@ -76,7 +76,7 @@ GptNeoXWeight::GptNeoXWeight(const int hidde } template -GptNeoXWeight::~GptNeoXWeight() +LLaMAWeight::~LLaMAWeight() { if (is_maintain_buffer == true) { for (int i = 0; i < weights_ptr.size(); i++) { @@ -92,7 +92,7 @@ GptNeoXWeight::~GptNeoXWeight() } template -GptNeoXWeight::GptNeoXWeight(const GptNeoXWeight& other): +LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): hidden_units_(other.hidden_units_), inter_size_(other.inter_size_), vocab_size_(other.vocab_size_), @@ -137,7 +137,7 @@ GptNeoXWeight::GptNeoXWeight(const GptNeoXWeight& other): } template -GptNeoXWeight& GptNeoXWeight::operator=(const GptNeoXWeight& other) +LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) { hidden_units_ = other.hidden_units_; inter_size_ = other.inter_size_; @@ -184,7 +184,7 @@ GptNeoXWeight& GptNeoXWeight::operator=(const GptNeoXWeight& other) } template -void GptNeoXWeight::setWeightPtr() +void LLaMAWeight::setWeightPtr() { prompt_learning_table.resize(prompt_learning_pair_.size()); @@ -207,7 +207,7 @@ void GptNeoXWeight::setWeightPtr() } template -void GptNeoXWeight::mallocWeights() +void LLaMAWeight::mallocWeights() { weights_ptr.resize(num_base_weights + prompt_learning_pair_.size()); @@ -233,9 +233,9 @@ void GptNeoXWeight::mallocWeights() } template -void GptNeoXWeight::loadModel(std::string dir_path) +void LLaMAWeight::loadModel(std::string dir_path) { - FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "gptneox"); + FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "llama"); FT_CHECK(is_maintain_buffer == true); loadWeightFromBin( @@ -279,24 +279,24 @@ void GptNeoXWeight::loadModel(std::string dir_path) } template -void GptNeoXWeight::resizeLayer(const int num_layer) +void LLaMAWeight::resizeLayer(const int num_layer) { num_layer_ = 
num_layer; decoder_layer_weights.reserve(num_layer_); for (int l = 0; l < num_layer_; l++) { - decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight()); + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight()); } } template -bool GptNeoXWeight::isValidLayerParallelId(int l) +bool LLaMAWeight::isValidLayerParallelId(int l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / layer_para_size_)); return l < num_layer_ && (l >= local_num_layer * layer_para_rank_) && (l < local_num_layer * (layer_para_rank_ + 1)); } -template struct GptNeoXWeight; -template struct GptNeoXWeight; +template struct LLaMAWeight; +template struct LLaMAWeight; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAWeight.h b/src/fastertransformer/models/llama/LLaMAWeight.h index 3e868854e..dd602c107 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.h +++ b/src/fastertransformer/models/llama/LLaMAWeight.h @@ -17,17 +17,17 @@ #pragma once #include "src/fastertransformer/kernels/layernorm_kernels.h" -#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" #include "src/fastertransformer/utils/memory_utils.h" #include "src/fastertransformer/utils/prompt_learning.h" namespace fastertransformer { template -struct GptNeoXWeight { +struct LLaMAWeight { - GptNeoXWeight() = default; - GptNeoXWeight( + LLaMAWeight() = default; + LLaMAWeight( const int hidden_units, const int inter_size, const int vocab_size, @@ -41,18 +41,18 @@ struct GptNeoXWeight { PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, std::map> prompt_learning_pair = std::map>{}); - ~GptNeoXWeight(); - GptNeoXWeight(const GptNeoXWeight& other); - GptNeoXWeight& operator=(const GptNeoXWeight& other); + ~LLaMAWeight(); + LLaMAWeight(const LLaMAWeight& other); + LLaMAWeight& operator=(const LLaMAWeight& other); void loadModel(std::string dir_path); void resizeLayer(const int num_layer); - std::vector*> decoder_layer_weights; + std::vector*> decoder_layer_weights; const T* pre_decoder_embedding_table = nullptr; // GPT-J does not use embedding table, but we leave the ptr such that - // GptNeoX::forward and Gpt::forward become identical + // LLaMA::forward and Gpt::forward become identical const T* position_encoding_table = nullptr; /* From a590e948eca2169b98008f9f1837b729838a295a Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 12 Sep 2023 08:31:13 +0000 Subject: [PATCH 04/55] dump --- examples/cpp/CMakeLists.txt | 1 + src/fastertransformer/models/llama/LLaMA.cc | 171 ++---------------- src/fastertransformer/models/llama/LLaMA.h | 16 -- .../models/llama/LLaMAWeight.cc | 103 +---------- .../models/llama/LLaMAWeight.h | 16 +- src/fastertransformer/th_op/llama/LLaMA.cc | 8 +- src/fastertransformer/th_op/llama/LLaMA.h | 44 ++--- 7 files changed, 52 insertions(+), 307 deletions(-) diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index da24d72c6..38ae86412 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -26,6 +26,7 @@ add_subdirectory(wenet) add_subdirectory(gptj) add_subdirectory(gptneox) +add_subdirectory(llama) add_subdirectory(multi_gpu_gpt) if(ENABLE_FP8) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 4ac26a473..575636fb4 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -214,16 +214,8 @@ LLaMA::LLaMA(size_t head_num, size_t 
rotary_embedding_dim, int start_id, int end_id, - int prompt_learning_start_id, // only needed by p/prompt-tuning - PromptLearningType prompt_learning_type, bool use_gptj_residual, - float beam_search_diversity_rate, - size_t top_k, - float top_p, unsigned long long random_seed, - float temperature, - float len_penalty, - float repetition_penalty, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, @@ -241,8 +233,6 @@ LLaMA::LLaMA(size_t head_num, rotary_embedding_dim_(rotary_embedding_dim), start_id_(start_id), end_id_(end_id), - prompt_learning_start_id_(prompt_learning_start_id), - prompt_learning_type_(prompt_learning_type), use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), local_head_num_(head_num / 1), @@ -270,16 +260,8 @@ LLaMA::LLaMA(size_t head_num, size_t rotary_embedding_dim, int start_id, int end_id, - int prompt_learning_start_id, // only needed by p/prompt-tuning - PromptLearningType prompt_learning_type, bool use_gptj_residual, - float beam_search_diversity_rate, - size_t top_k, - float top_p, unsigned long long random_seed, - float temperature, - float len_penalty, - float repetition_penalty, NcclParam tensor_para, NcclParam pipeline_para, cudaStream_t stream, @@ -299,8 +281,6 @@ LLaMA::LLaMA(size_t head_num, rotary_embedding_dim_(rotary_embedding_dim), start_id_(start_id), end_id_(end_id), - prompt_learning_start_id_(prompt_learning_start_id), - prompt_learning_type_(prompt_learning_type), use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), tensor_para_(tensor_para), @@ -330,7 +310,6 @@ LLaMA::LLaMA(LLaMA const& llama): start_id_(llama.start_id_), end_id_(llama.end_id_), prompt_learning_start_id_(llama.prompt_learning_start_id_), - prompt_learning_type_(llama.prompt_learning_type_), use_gptj_residual_(llama.use_gptj_residual_), hidden_units_(llama.hidden_units_), tensor_para_(llama.tensor_para_), @@ -383,26 +362,13 @@ void LLaMA::forward(std::unordered_map* output_ten // input_tensors: // input_ids [batch_size, max_input_length] // input_lengths [batch_size] - // prompt_learning_task_name_ids [batch_size] on cpu, optional // output_seq_len [batch_size] on cpu // start_id [batch_size] on cpu, optional // end_id [batch_size] on cpu, optional // stop_words_list [batch_size, 2, stop_words_length], optional // bad_words_list [2, bad_words_length] or [batch_size, 2, bad_words_length], optional - // runtime_top_k [1] or [batch_size] on cpu, optional, uint. - // runtime_top_p [1] or [batch_size] on cpu, optional, float. - // beam_search_diversity_rate [1] or [batch_size] on cpu, optional, float. - // temperature [1] or [batch_size] on cpu, optional, float. - // len_penalty [1] or [batch_size] on cpu, optional, float. - // repetition_penalty [1] or [batch_size] on cpu, optional, float. // min_length [1] or [batch_size] on cpu, optional, int // random_seed [1] or [batch_size] on cpu, optional, unsigned long long int. 
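    // Only input_ids, input_lengths and output_seq_len are required; the remaining entries are optional.
    // A minimal sketch of the map a caller might build (pointer names d_input_ids, d_input_lengths and
    // h_output_seq_len are illustrative; memory types and dtypes follow the torch wrapper above):
    //   std::unordered_map<std::string, Tensor> input_tensors{
    //       {"input_ids",      Tensor{MEMORY_GPU, TYPE_INT32,  {batch_size, max_input_length}, d_input_ids}},
    //       {"input_lengths",  Tensor{MEMORY_GPU, TYPE_INT32,  {batch_size},                   d_input_lengths}},
    //       {"output_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, {batch_size},                   h_output_seq_len}}};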
- // request_prompt_lengths [batch_size], optional - // request_prompt_embedding [batch_size, max_prompt_length, hidden_units], float, optional - // requst_prompt_type [batch_size], int, optional - // top_p_decay [batch_size] on gpu, float, optional - // top_p_min [batch_size] on gpu, float, optional - // top_p_reset_ids [batch_size] on gpu, uint32, optional // output_tensors: // output_ids [batch_size, beam_width, max_output_seq_len] @@ -432,83 +398,28 @@ void LLaMA::forward(std::unordered_map* output_ten const size_t batch_size = output_tensors->at("output_ids").shape[0]; const size_t beam_width = output_tensors->at("output_ids").shape[1]; - PromptLearningType request_prompt_type = PromptLearningType::no_prompt; - int valid_prompt_inputs = input_tensors->count("request_prompt_type") - + input_tensors->count("request_prompt_lengths") - + input_tensors->count("request_prompt_embedding"); - - if (valid_prompt_inputs == 3) { - request_prompt_type = static_cast(input_tensors->at("request_prompt_type").getVal()); - FT_LOG_INFO("Apply prompt embedding from input, will ignore task name ids"); - } - else if (valid_prompt_inputs > 0) { - FT_LOG_WARNING( - "Prompts not applied: request_prompt_embedding, request_prompt_lengths, request_prompt_type are all needed!"); - } - if (request_prompt_type == PromptLearningType::prefix_prompt) { - FT_LOG_WARNING("Request prompt doesn't support prefix prompt currently!"); - } // Prefix Prompt Inputs // Padding works as follows: p p x x i i i x x --> p p i i i x x x x (p denotes prompt, i denotes input, x denotes // pad) // TODO (perkzz): move unnecessary paddings - const int* prompt_learning_task_name_ids = - input_tensors->count("prompt_learning_task_name_ids") ? - input_tensors->at("prompt_learning_task_name_ids").getPtr() : - nullptr; - has_prefix_prompt_ = - (prompt_learning_task_name_ids != nullptr) && (prompt_learning_type_ == PromptLearningType::prefix_prompt); + has_prefix_prompt_ = false; int max_prefix_prompt_length = 0; - FT_CHECK_WITH_INFO( - !(prompt_learning_task_name_ids != nullptr - && (prompt_learning_type_ == PromptLearningType::no_prompt - || prompt_learning_type_ == PromptLearningType::soft_prompt)), - "prompt_learning_type is prefix_prompt either p_prompt_tuning when prompt_learning_task_name_ids are provided."); - // NOTE: Prefix Prompt PreProcessing // get prefix_prompt_weight for each batch --> shape [batch, beam_width] // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] std::vector prefix_prompt_weight_batch_ptrs; std::vector prefix_prompt_lengths; - if (has_prefix_prompt_) { - for (int bs_id = 0; bs_id < batch_size; ++bs_id) { - int task_id = prompt_learning_task_name_ids[bs_id]; - // throw errors when prompt task_name_ids are not found - std::pair prefix_prompt_weight_length_pair; - try { - prefix_prompt_weight_length_pair = llama_weights->prompt_learning_table.at(task_id); - } - catch (const std::out_of_range& oor) { - FT_LOG_ERROR("prefix_prompt_weights_lengths not found for prompt task id: " + task_id); - throw oor; - } - for (int bw_id = 0; bw_id < beam_width; ++bw_id) { - prefix_prompt_weight_batch_ptrs.push_back(prefix_prompt_weight_length_pair.first); - prefix_prompt_lengths.push_back(prefix_prompt_weight_length_pair.second); - } - } - - max_prefix_prompt_length = *max_element(prefix_prompt_lengths.begin(), prefix_prompt_lengths.end()); - - FT_LOG_DEBUG("max_prefix_prompt_length: %d", max_prefix_prompt_length); - - if (max_prefix_prompt_length == 0) { - has_prefix_prompt_ = false; - 
FT_LOG_DEBUG("prompts are not applied !"); - } - } int max_input_length = input_tensors->at("input_ids").shape[1]; FT_CHECK_WITH_INFO(!(max_input_length == 0 && max_prefix_prompt_length > 0), "Prefix Prompt should come with inputs!"); // Prefix Soft Prompt - has_prefix_soft_prompt_ = request_prompt_type == PromptLearningType::soft_prompt; - const size_t max_prefix_soft_prompt_length = - has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; - const size_t limit_len_offset = max_prefix_soft_prompt_length + (max_input_length == 0 ? 1 : 0); + has_prefix_soft_prompt_ = false; + const size_t max_prefix_soft_prompt_length = 0; + const size_t limit_len_offset = 0 + (max_input_length == 0 ? 1 : 0); const size_t max_output_seq_len = input_tensors->at("output_seq_len").max() + limit_len_offset; const size_t max_seq_len = max_output_seq_len; // max cache seq len should include max prefix prompt length as it has k/v states @@ -527,7 +438,7 @@ void LLaMA::forward(std::unordered_map* output_ten } const cudaDataType_t gemm_data_type = getCudaDataType(); allocateBuffer( - batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length + max_prefix_soft_prompt_length); + batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length + 0); setSeqLimitLen(seq_limit_len_, input_tensors->at("output_seq_len"), limit_len_offset, batch_size); sync_check_cuda_error(); @@ -562,23 +473,11 @@ void LLaMA::forward(std::unordered_map* output_ten } // Prefix prompts - if (has_prefix_prompt_) { - cudaMemcpyAsync(prompt_learning_weight_batch_, - prefix_prompt_weight_batch_ptrs.data(), - sizeof(T*) * batch_size * beam_width, - cudaMemcpyDefault, - stream_); - cudaMemcpyAsync(tiled_prompt_lengths_buf_, - prefix_prompt_lengths.data(), - sizeof(int) * batch_size * beam_width, - cudaMemcpyDefault, - stream_); - } sync_check_cuda_error(); // handle first step - if (has_prefix_prompt_ || has_prefix_soft_prompt_ || max_input_length > 1) { + if (max_input_length > 1) { invokeTileGptInputs(tiled_input_ids_buf_, tiled_input_lengths_buf_, input_tensors->at("input_ids").getPtr(), @@ -589,43 +488,19 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); - if (has_prefix_soft_prompt_) { - inputIdsEmbeddingLookupPosEncodingSoftPromptParam param; - param.from_tensor = context_decoder_input_buf_; - param.output_ids = output_ids_buf_; - param.input_lengths = tiled_input_lengths_buf_; - param.embedding_table = llama_weights->pre_decoder_embedding_table; - param.pos_table = llama_weights->position_encoding_table; - param.prefix_soft_prompt_embedding = input_tensors->at("request_prompt_embedding").getPtr(); - param.prefix_soft_prompt_lengths = input_tensors->at("request_prompt_lengths").getPtr(); - param.input_ids = tiled_input_ids_buf_; - param.start_step = 1; - param.max_input_length = max_input_length; - param.max_prefix_soft_prompt_length = max_prefix_soft_prompt_length; - param.batch_size = batch_size; - param.beam_width = beam_width; - param.hidden_units = hidden_units_; - param.stream = stream_; - - invokeInputIdsEmbeddingLookupPosEncodingSoftPrompt(param); - sync_check_cuda_error(); - max_input_length += max_prefix_soft_prompt_length; // view soft_prompt as input - } - else { - invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, - output_ids_buf_, - llama_weights->pre_decoder_embedding_table, - llama_weights->position_encoding_table, - pPromptTuningParam{}, // no p/prompt tuning - tiled_input_ids_buf_, - 1, - max_input_length, - 
max_input_length, - batch_size * beam_width, - hidden_units_, - stream_); - sync_check_cuda_error(); - } + invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, + output_ids_buf_, + llama_weights->pre_decoder_embedding_table, + llama_weights->position_encoding_table, + pPromptTuningParam{}, // no p/prompt tuning + tiled_input_ids_buf_, + 1, + max_input_length, + max_input_length, + batch_size * beam_width, + hidden_units_, + stream_); + sync_check_cuda_error(); invokeBuildDecoderAttentionMask(input_attention_mask_, tiled_input_lengths_buf_, @@ -688,8 +563,6 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); } else if (max_input_length == 0) { - FT_CHECK(prompt_learning_type_ == PromptLearningType::no_prompt - && request_prompt_type == PromptLearningType::no_prompt); // Not support prompts in this case max_input_length++; invokeDecodingInitialize(finished_buf_, sequence_lengths_, @@ -709,8 +582,6 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); } else if (max_input_length == 1) { - FT_CHECK(prompt_learning_type_ == PromptLearningType::no_prompt - && request_prompt_type == PromptLearningType::no_prompt); // Not support prompts in this case invokeDecodingInitialize(finished_buf_, sequence_lengths_, nullptr, @@ -1093,8 +964,6 @@ void LLaMA::setOutputTensors(std::unordered_map* o const size_t batch_size = output_tensors->at("output_ids").shape[0]; const size_t beam_width = output_tensors->at("output_ids").shape[1]; uint* sequence_lengths = output_tensors->at("sequence_length").getPtr(); - const size_t max_prefix_soft_prompt_length = - has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; if (input_tensors->at("input_ids").shape[1] == 0) { invokeCudaD2DcpyConvert( @@ -1149,7 +1018,7 @@ void LLaMA::setOutputTensors(std::unordered_map* o param.prefix_soft_prompt_lengths = has_prefix_soft_prompt_ ? 
input_tensors->at("request_prompt_lengths").getPtr() : nullptr; param.input_lengths = tiled_input_lengths_buf_; - param.max_prefix_soft_prompt_length = max_prefix_soft_prompt_length; + param.max_prefix_soft_prompt_length = 0; param.max_input_without_prompt_length = max_input_length; param.stream = stream_; param.output_ids = output_tensors->at("output_ids").getPtr(); diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 5cf7b0025..48506f529 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -145,16 +145,8 @@ class LLaMA: public BaseLayer { size_t rotary_embedding_dim, int start_id, int end_id, - int prompt_learning_start_id, // only needed by p/prompt-tuning - PromptLearningType prompt_learning_type, bool use_gptj_residual, - float beam_search_diversity_rate, - size_t top_k, - float top_p, unsigned long long random_seed, - float temperature, - float len_penalty, - float repetition_penalty, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, @@ -172,16 +164,8 @@ class LLaMA: public BaseLayer { size_t rotary_embedding_dim, int start_id, int end_id, - int prompt_learning_start_id, // only needed by p/prompt-tuning - PromptLearningType prompt_learning_type, bool use_gptj_residual, - float beam_search_diversity_rate, - size_t top_k, - float top_p, unsigned long long random_seed, - float temperature, - float len_penalty, - float repetition_penalty, NcclParam tensor_para, NcclParam pipeline_para, cudaStream_t stream, diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc index cc8c5ab25..dddf6eff6 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.cc +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -28,9 +28,7 @@ LLaMAWeight::LLaMAWeight(const int hidden_un const int tensor_para_rank, const int layer_para_size, const int layer_para_rank, - const bool use_gptj_residual, - PromptLearningType prompt_learning_type, - std::map> prompt_learning_pair): + const bool use_gptj_residual): hidden_units_(hidden_units), inter_size_(inter_size), vocab_size_(vocab_size), @@ -40,23 +38,10 @@ LLaMAWeight::LLaMAWeight(const int hidden_un tensor_para_rank_(tensor_para_rank), layer_para_size_(layer_para_size), layer_para_rank_(layer_para_rank), - use_gptj_residual_(use_gptj_residual), - prompt_learning_type_(prompt_learning_type), - prompt_learning_pair_(prompt_learning_pair) + use_gptj_residual_(use_gptj_residual) { FT_CHECK(num_layer_ % layer_para_size_ == 0); - // set prompt weight size - if (prompt_learning_type_ == PromptLearningType::prefix_prompt) { - prompt_token_weight_size_ = 2 * num_layer_ * hidden_units_ / tensor_para_size_; - } - else if (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) { - prompt_token_weight_size_ = hidden_units_; - } - // set if load and malloc prompt weights - malloc_load_prompt_weights_ = !prompt_learning_pair_.empty() - && (prompt_learning_type_ == PromptLearningType::p_prompt_tuning - || prompt_learning_type_ == PromptLearningType::prefix_prompt); decoder_layer_weights.reserve(num_layer_); for (int l = 0; l < num_layer_; l++) { @@ -103,10 +88,7 @@ LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), use_gptj_residual_(other.use_gptj_residual_), - prompt_token_weight_size_(other.prompt_token_weight_size_), - malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), - 
prompt_learning_type_(other.prompt_learning_type_), - prompt_learning_pair_(other.prompt_learning_pair_) + prompt_token_weight_size_(other.prompt_token_weight_size_) { mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); @@ -115,18 +97,6 @@ LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); // prompt learning table: malloc weights and set weight ptr - if (malloc_load_prompt_weights_) { - for (auto const& prompt : prompt_learning_pair_) { - std::string task_name = prompt.first; - int task_name_id = prompt.second.first; - int prompt_length = prompt.second.second; - size_t prompt_id = num_base_weights + (size_t)task_name_id; - - // cuda device to device memcpy prompt table weights buffer memory - cudaD2Dcpy(weights_ptr[prompt_id], other.weights_ptr[prompt_id], prompt_length * prompt_token_weight_size_); - } - } - setWeightPtr(); decoder_layer_weights.clear(); @@ -150,9 +120,6 @@ LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) layer_para_rank_ = other.layer_para_rank_; use_gptj_residual_ = other.use_gptj_residual_; prompt_token_weight_size_ = other.prompt_token_weight_size_; - malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; - prompt_learning_type_ = other.prompt_learning_type_; - prompt_learning_pair_ = other.prompt_learning_pair_; mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); @@ -160,19 +127,6 @@ LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); - // prompt learning table: malloc weights and set weight ptr - if (malloc_load_prompt_weights_) { - for (auto const& prompt : prompt_learning_pair_) { - std::string task_name = prompt.first; - int task_name_id = prompt.second.first; - int prompt_length = prompt.second.second; - size_t prompt_id = num_base_weights + (size_t)task_name_id; - - // cuda device to device memcpy prompt table weights buffer memory - cudaD2Dcpy(weights_ptr[prompt_id], other.weights_ptr[prompt_id], prompt_length * prompt_token_weight_size_); - } - } - setWeightPtr(); decoder_layer_weights.clear(); @@ -186,49 +140,22 @@ LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) template void LLaMAWeight::setWeightPtr() { - prompt_learning_table.resize(prompt_learning_pair_.size()); - pre_decoder_embedding_table = weights_ptr[0]; post_decoder_layernorm.beta = weights_ptr[1]; post_decoder_layernorm.gamma = weights_ptr[2]; post_decoder_embedding.kernel = weights_ptr[3]; - - // prompt learning tables: set weight ptr - if (malloc_load_prompt_weights_) { - for (auto const& prompt : prompt_learning_pair_) { - int task_name_id = prompt.second.first; - int prompt_length = prompt.second.second; - size_t task_weight_id = num_base_weights + (size_t)task_name_id; - - // set weight ptr - prompt_learning_table[task_name_id] = {weights_ptr[task_weight_id], prompt_length}; - } - } } template void LLaMAWeight::mallocWeights() { - weights_ptr.resize(num_base_weights + prompt_learning_pair_.size()); + weights_ptr.resize(num_base_weights); deviceMalloc(&weights_ptr[0], vocab_size_ * hidden_units_); deviceMalloc(&weights_ptr[1], hidden_units_); deviceMalloc(&weights_ptr[2], hidden_units_); deviceMalloc(&weights_ptr[3], hidden_units_ * vocab_size_); - // prompt learning tables: malloc weights - if (malloc_load_prompt_weights_) { - for (auto 
const& prompt : prompt_learning_pair_) { - int task_name_id = prompt.second.first; - int prompt_length = prompt.second.second; - size_t task_weight_id = num_base_weights + (size_t)task_name_id; - - // malloc weights - T* prompt_weights_ptr = nullptr; - deviceMalloc(&prompt_weights_ptr, prompt_length * prompt_token_weight_size_); - weights_ptr[task_weight_id] = prompt_weights_ptr; - } - } is_maintain_buffer = true; } @@ -249,28 +176,6 @@ void LLaMAWeight::loadModel(std::string dir_path) dir_path + "/model.lm_head.weight.bin", model_file_type); - // prompt table: load weights from bin - if (malloc_load_prompt_weights_) { - for (auto const& prompt : prompt_learning_pair_) { - std::string task_name = prompt.first; - int task_name_id = prompt.second.first; - int prompt_length = prompt.second.second; - size_t task_weight_id = num_base_weights + (size_t)task_name_id; - - std::string prompt_weight_path_name = (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) ? - (dir_path + "/model.prompt_table." + task_name + ".weight.bin") : - (dir_path + "/model.prefix_prompt." + task_name + ".weight." - + std::to_string(tensor_para_rank_) + ".bin"); - - if (prompt_length > 0) { - loadWeightFromBin(weights_ptr[task_weight_id], - {(size_t)(prompt_length * (int)prompt_token_weight_size_)}, - prompt_weight_path_name, - model_file_type); - } - } - } - for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { decoder_layer_weights[l]->loadModel(dir_path + "/model.layers." + std::to_string(l), model_file_type); diff --git a/src/fastertransformer/models/llama/LLaMAWeight.h b/src/fastertransformer/models/llama/LLaMAWeight.h index dd602c107..5f3c071e6 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.h +++ b/src/fastertransformer/models/llama/LLaMAWeight.h @@ -37,9 +37,7 @@ struct LLaMAWeight { const int tensor_para_rank = 0, const int layer_para_size = 1, const int layer_para_rank = 0, - const bool use_gptj_residual_ = true, - PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, - std::map> prompt_learning_pair = std::map>{}); + const bool use_gptj_residual_ = true); ~LLaMAWeight(); LLaMAWeight(const LLaMAWeight& other); @@ -55,15 +53,6 @@ struct LLaMAWeight { // LLaMA::forward and Gpt::forward become identical const T* position_encoding_table = nullptr; - /* - prompt_learning_pair = vectors of [weight ptr, prompt length] pair - prompt_length is stored here for compatible prompt learning table - prefix_prompt weights store as shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] - p/prompt tuning weights store as shape [prompt_len, hidden_units] - idx is the task_name_id of the prompt tables - */ - std::vector> prompt_learning_table = {}; - LayerNormWeight post_decoder_layernorm; DenseWeight post_decoder_embedding; @@ -92,9 +81,6 @@ struct LLaMAWeight { bool use_gptj_residual_; // prompt learning pair (task_name, (task_name_id, prompt_len)) - PromptLearningType prompt_learning_type_; - std::map> prompt_learning_pair_; - bool malloc_load_prompt_weights_ = false; // each prompt token's weight size size_t prompt_token_weight_size_ = 0; diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index e913570cd..08449b679 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -41,7 +41,7 @@ LLaMA::LLaMA(const int64_t head_num, switch (st_) { case at::ScalarType::Float: - ftgpt = new FTGptNeoX((size_t)head_num, + ftllama = new FTLLaMA((size_t)head_num, 
(size_t)size_per_head, (size_t)inter_size, (size_t)layer_num, @@ -56,7 +56,7 @@ LLaMA::LLaMA(const int64_t head_num, weights); break; case at::ScalarType::Half: - ftgpt = new FTGptNeoX((size_t)head_num, + ftllama = new FTLLaMA((size_t)head_num, (size_t)size_per_head, (size_t)inter_size, (size_t)layer_num, @@ -77,7 +77,7 @@ LLaMA::LLaMA(const int64_t head_num, LLaMA::~LLaMA() { - delete ftgpt; + delete ftllama; } std::vector LLaMA::forward(th::Tensor input_ids, @@ -119,7 +119,7 @@ std::vector LLaMA::forward(th::Tensor input_ids, th::Tensor cum_log_probs = torch::empty({batch_size, beam_width}, torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false)); - ftgpt->forward(input_ids, + ftllama->forward(input_ids, input_lengths, output_ids, sequence_lengths, diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 3cca0bb19..1aac8a7d7 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -81,40 +81,40 @@ class FTLLaMA: public IFLLaMA { ftNcclInitialize(tensor_para_, pipeline_para_, tensor_para_size, pipeline_para_size); - gpt_weights_.resizeLayer(layer_num_); + llama_weights_.resizeLayer(layer_num_); for (int i = 0; i < (int)layer_num_; i++) { - gpt_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta = + llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta = get_ptr(weights_[i + 0 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->pre_layernorm_weights.gamma = + llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.gamma = get_ptr(weights_[i + 1 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.kernel = + llama_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.kernel = get_ptr(weights_[i + 2 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.bias = + llama_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.bias = get_ptr(weights_[i + 3 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.kernel = + llama_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.kernel = get_ptr(weights_[i + 4 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.bias = + llama_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.bias = get_ptr(weights_[i + 5 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.kernel = + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.kernel = get_ptr(weights_[i + 6 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.bias = + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.bias = get_ptr(weights_[i + 7 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.kernel = + llama_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.kernel = get_ptr(weights_[i + 8 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.bias = + llama_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.bias = get_ptr(weights_[i + 9 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.beta = + llama_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.beta = get_ptr(weights_[i + 10 * layer_num_]); - 
gpt_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.gamma = + llama_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.gamma = get_ptr(weights_[i + 11 * layer_num_]); } - gpt_weights_.pre_decoder_embedding_table = get_ptr(weights_[12 * layer_num_ + 0]); - gpt_weights_.post_decoder_layernorm.gamma = get_ptr(weights_[12 * layer_num_ + 1]); - gpt_weights_.post_decoder_layernorm.beta = get_ptr(weights_[12 * layer_num_ + 2]); - gpt_weights_.post_decoder_embedding.kernel = get_ptr(weights_[12 * layer_num_ + 3]); + llama_weights_.pre_decoder_embedding_table = get_ptr(weights_[12 * layer_num_ + 0]); + llama_weights_.post_decoder_layernorm.gamma = get_ptr(weights_[12 * layer_num_ + 1]); + llama_weights_.post_decoder_layernorm.beta = get_ptr(weights_[12 * layer_num_ + 2]); + llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[12 * layer_num_ + 3]); - gpt_weights_.setMaxSeqLen(max_seq_len); + llama_weights_.setMaxSeqLen(max_seq_len); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); } @@ -172,7 +172,7 @@ class FTLLaMA: public IFLLaMA { false, // with_relative_position_bias true); // causal_mask - ft::LLaMA gpt = ft::LLaMA(head_num_, + ft::LLaMA llama = ft::LLaMA(head_num_, size_per_head_, inter_size_, layer_num_, @@ -267,7 +267,7 @@ class FTLLaMA: public IFLLaMA { } try { - gpt.forward(&output_tensors, &input_tensors, &gpt_weights_); + llama.forward(&output_tensors, &input_tensors, &llama_weights_); } catch (std::runtime_error& error) { std::cout << error.what(); @@ -297,7 +297,7 @@ class FTLLaMA: public IFLLaMA { std::mutex* cublas_wrapper_mutex_; ft::cublasAlgoMap* cublas_algo_map_; struct cudaDeviceProp prop_; - ft::LLaMAWeight gpt_weights_; + ft::LLaMAWeight llama_weights_; ft::NcclParam tensor_para_; ft::NcclParam pipeline_para_; @@ -339,7 +339,7 @@ class LLaMA: public th::jit::CustomClassHolder { private: const at::ScalarType st_; - IFLLaMA* ftgpt; + IFLLaMA* ftllama; std::vector weights; }; From a763d188fc202c2b0e0c64a8d9ec72d279815a3d Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 12 Sep 2023 11:46:03 +0000 Subject: [PATCH 05/55] add examples --- examples/cpp/llama/CMakeLists.txt | 18 ++ examples/cpp/llama/bad_words.csv | 2 + examples/cpp/llama/llama_config.ini | 23 ++ examples/cpp/llama/llama_example.cc | 403 ++++++++++++++++++++++++++++ examples/cpp/llama/start_ids.csv | 8 + examples/cpp/llama/stop_words.csv | 2 + 6 files changed, 456 insertions(+) create mode 100644 examples/cpp/llama/CMakeLists.txt create mode 100644 examples/cpp/llama/bad_words.csv create mode 100644 examples/cpp/llama/llama_config.ini create mode 100644 examples/cpp/llama/llama_example.cc create mode 100644 examples/cpp/llama/start_ids.csv create mode 100644 examples/cpp/llama/stop_words.csv diff --git a/examples/cpp/llama/CMakeLists.txt b/examples/cpp/llama/CMakeLists.txt new file mode 100644 index 000000000..ce0bee75f --- /dev/null +++ b/examples/cpp/llama/CMakeLists.txt @@ -0,0 +1,18 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +add_executable(llama_example llama_example.cc) +target_link_libraries(llama_example PUBLIC -lcublas -lcublasLt -lcudart + LLaMA mpi_utils nccl_utils nvtx_utils + gpt_example_utils word_list) diff --git a/examples/cpp/llama/bad_words.csv b/examples/cpp/llama/bad_words.csv new file mode 100644 index 000000000..6a1126ebd --- /dev/null +++ b/examples/cpp/llama/bad_words.csv @@ -0,0 +1,2 @@ +7768,3908 +1,2 diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini new file mode 100644 index 000000000..58874bdc2 --- /dev/null +++ b/examples/cpp/llama/llama_config.ini @@ -0,0 +1,23 @@ +[ft_instance_hyperparameter] +model_name=llama_33B +model_dir=../models/llam +data_type=fp16 +pipeline_para_size=4 + + +[request] +beam_width=1 # beam width for beam search +request_batch_size=8 # determine by the request +request_output_len=0 # determine by the request + +[llama_33B] +head_num=52 +size_per_head=128 +vocab_size=32000 +decoder_layers=60 +rotary_embedding=128 +multiple_of=256 +start_id=0 +end_id=2 + +use_gptj_residual=1 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc new file mode 100644 index 000000000..699e39154 --- /dev/null +++ b/examples/cpp/llama/llama_example.cc @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/LLaMA.h" +#include "src/fastertransformer/utils/mpi_utils.h" +#include "src/fastertransformer/utils/nccl_utils.h" +#include "src/fastertransformer/utils/nvtx_utils.h" +#include "src/fastertransformer/utils/word_list.h" +#include "3rdparty/INIReader.h" + +// Remove LATER +#include "examples/cpp/multi_gpu_gpt/gpt_example_utils.h" + +#include +#include +#include +#include +#include +#include + +using namespace fastertransformer; + +template +void llama_example(const INIReader reader); + +int main(int argc, char* argv[]) +{ + mpi::initialize(&argc, &argv); + srand(0); + + std::string ini_name; + if (argc == 2) { + ini_name = std::string(argv[1]); + } + else { + ini_name = "../examples/cpp/llama/llama_config.ini"; + } + + INIReader reader = INIReader(ini_name); + if (reader.ParseError() < 0) { + std::cout << "[ERROR] Can't load '" << ini_name << "'\n"; + return -1; + } + const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type"); + + if (data_type == "fp32") { + llama_example(reader); + } + else if (data_type == "fp16") { + llama_example(reader); + } + else { + FT_LOG_ERROR("is_fp16 should be 0 (use float) or 1 (use half)."); + return -1; + } + mpi::finalize(); + return 0; +} + +template +void llama_example(const INIReader reader) +{ + const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); + std::string model_dir = std::string(reader.Get("ft_instance_hyperparameter", "model_dir")); + int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); + + const size_t head_num = reader.GetInteger(model_name, "head_num"); + const size_t size_per_head = reader.GetInteger(model_name, "size_per_head"); + const size_t vocab_size = reader.GetInteger(model_name, "vocab_size"); + const size_t decoder_layers = reader.GetInteger(model_name, "decoder_layers"); + const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); + const int multiple_of = reader.GetInteger(model_name, "multiple_of"); + const int start_id = reader.GetInteger(model_name, "start_id"); + const int end_id = reader.GetInteger(model_name, "end_id"); + + const size_t hidden_units = head_num * size_per_head; + const size_t inter_size = multiple_of * ((2 * hidden_units + multiple_of -1) / multiple_of); + + const size_t beam_width = reader.GetInteger("request", "beam_width"); + const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); + const int request_output_len = reader.GetInteger("request", "request_output_len"); + const int min_length = reader.GetInteger("request", "min_length", 0); + + FT_CHECK(decoder_layers % pipeline_para_size == 0); + + // Prepare the parallelism parameters + int rank = mpi::getCommWorldRank(); + int world_size = mpi::getCommWorldSize(); + if (rank == 0) { + printf("Total ranks: %d.\n", world_size); + } + int device, device_count; + check_cuda_error(cudaGetDeviceCount(&device_count)); + check_cuda_error(cudaSetDevice(rank % device_count)); + check_cuda_error(cudaGetDevice(&device)); + + struct cudaDeviceProp prop; + check_cuda_error(cudaGetDeviceProperties(&prop, device)); + printf("Device %s\n", prop.name); + + printf("P%d is running with GPU #%d.\n", rank, device); + if (pipeline_para_size != world_size) { + printf("[ERROR] pipeline_para_size should equal to world_size \n"); + exit(-1); + } + + const int layers_per_group = decoder_layers / pipeline_para_size; + if (layers_per_group * pipeline_para_size != 
(int)decoder_layers) { + printf("[ERROR] layers_per_group (%d) * pipeline_para_size (%d) should equal to decoder_layers (%ld) \n", + layers_per_group, + pipeline_para_size, + decoder_layers); + exit(-1); + } + + NcclParam tensor_para; + NcclParam pipeline_para; + ftNcclInitialize(tensor_para, pipeline_para, 1, pipeline_para_size); + + // Handle bad_words dictionary + std::vector bad_words; + read_word_list("../examples/cpp/llama/bad_words.csv", bad_words); + + int* d_bad_words = nullptr; + deviceMalloc(&d_bad_words, bad_words.size(), false); + cudaH2Dcpy(d_bad_words, bad_words.data(), bad_words.size()); + + // Handle stop_words dictionary + std::vector stop_words; + read_word_list("../examples/cpp/llama/stop_words.csv", stop_words); + + const size_t stop_words_len = stop_words.size() / 2; + // Tile with same dict for each element + std::vector tiled_stop_words; + for (int i = 0; i < request_batch_size; i++) { + tiled_stop_words.insert(tiled_stop_words.end(), stop_words.begin(), stop_words.end()); + } + + int* d_stop_words = nullptr; + deviceMalloc(&d_stop_words, tiled_stop_words.size(), false); + cudaH2Dcpy(d_stop_words, tiled_stop_words.data(), tiled_stop_words.size()); + + // Read ids of request from file. + size_t max_input_len = -1; + std::vector v_start_lengths; + std::vector v_start_ids; + read_start_ids(request_batch_size, + &v_start_lengths, + &v_start_ids, + max_input_len, + end_id, + 1, + "../examples/cpp/llama/start_ids.csv"); + + int* d_input_ids; + int* d_input_lengths; + if (max_input_len == 0) { + // unconditional case, no input ids, so do nothing. + d_input_ids = nullptr; + d_input_lengths = nullptr; + } + else { + // conditional case. + deviceMalloc(&d_input_ids, request_batch_size * max_input_len, false); + deviceMalloc(&d_input_lengths, request_batch_size, false); + cudaH2Dcpy(d_input_ids, v_start_ids.data(), request_batch_size * max_input_len); + cudaH2Dcpy(d_input_lengths, v_start_lengths.data(), request_batch_size); + } + std::vector start_ids(request_batch_size, start_id); + std::vector end_ids(request_batch_size, end_id); + + const int total_output_len = max_input_len + request_output_len; + + cudaStream_t stream; + cublasHandle_t cublas_handle; + cublasLtHandle_t cublaslt_handle; + cudaStreamCreate(&stream); + cublasCreate(&cublas_handle); + cublasLtCreate(&cublaslt_handle); + cublasSetStream(cublas_handle, stream); + cublasAlgoMap* cublas_algo_map = new cublasAlgoMap("gemm_config.in"); + + Allocator allocator(getDevice()); + + std::mutex* cublas_wrapper_mutex = new std::mutex(); + cublasMMWrapper cublas_wrapper = + cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, cublas_wrapper_mutex, &allocator); + if (std::is_same::value) { + cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } + else if (std::is_same::value) { + cublas_wrapper.setFP32GemmConfig(); + } + + // LLAMA Residual Type + const bool use_gptj_residual = (bool)reader.GetInteger(model_name, "use_gptj_residual", 1); + fastertransformer::LLaMAWeight llama_weights(hidden_units, + inter_size, + vocab_size, + decoder_layers, + 0, // max_seq_len, deprecated + tensor_para.world_size_, + tensor_para.rank_, + pipeline_para.world_size_, + pipeline_para.rank_, + use_gptj_residual); + + model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; + llama_weights.loadModel(model_dir); + unsigned long long random_seed; + if (rank == 0) { + random_seed = (unsigned long long)(0); + } + if (world_size > 1) { + mpi::bcast(&random_seed, 1, 
mpi::MPI_TYPE_UNSIGNED_LONG_LONG, 0, mpi::COMM_WORLD); + } + + AttentionType attention_type = getAttentionType(size_per_head, + getSMVersion(), + true, // remove_padding + 0, // llama supports any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + + LLaMA llama = LLaMA(head_num, + size_per_head, + inter_size, + decoder_layers, + vocab_size, + rotary_embedding_dim, + start_id, + end_id, + use_gptj_residual, + random_seed, + tensor_para, + pipeline_para, + stream, + &cublas_wrapper, + &allocator, + false, + &prop, + attention_type); + + int* d_output_ids; + int* d_sequence_lengths; + deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); + deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); + std::vector output_seq_len(request_batch_size, total_output_len); + std::unordered_map input_tensors = std::unordered_map{ + {"input_ids", + Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, (size_t)max_input_len}, d_input_ids}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size}, d_input_lengths}}, + {"output_seq_len", + Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}, + {"bad_words_list", Tensor{MEMORY_GPU, TYPE_INT32, {2, bad_words.size() / 2}, d_bad_words}}, + {"stop_words_list", Tensor{MEMORY_GPU, TYPE_INT32, {request_batch_size, 2, stop_words_len}, d_stop_words}}, + {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{1}, &min_length}}, + {"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, start_ids.data()}}, + {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, end_ids.data()}}}; + + + input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}); + + std::unordered_map output_tensors = std::unordered_map{ + {"output_ids", + Tensor{MEMORY_GPU, + TYPE_INT32, + std::vector{request_batch_size, beam_width, (size_t)total_output_len}, + d_output_ids}}, + {"sequence_length", + Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, beam_width}, d_sequence_lengths}}, + {"output_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + std::vector{(size_t)request_output_len, request_batch_size, beam_width}, + nullptr}}}; + + print_mem_usage(); + + int ite = 1; + cudaDeviceSynchronize(); + mpi::barrier(); + + cudaProfilerStart(); + // warm up + ite = 1; + ft_nvtx::setScope("warmup_time"); + PUSH_RANGE("warmup time") + for (int i = 0; i < ite; ++i) { + llama.forward(&output_tensors, &input_tensors, &llama_weights); + } + cudaDeviceSynchronize(); + mpi::barrier(); + + POP_RANGE; + ft_nvtx::resetScope(); + + if (rank == 0) { + + std::string fName = "out"; + auto outFile = std::ofstream(fName, std::ios::out); + if (!outFile.is_open()) { + printf("[WARNING] Cannot write results into output file %s \n", fName.c_str()); + } + else { + size_t outCount = total_output_len * request_batch_size * beam_width; + int* hBuf = new int[outCount]; + cudaD2Hcpy(hBuf, d_output_ids, outCount); + + { + std::cout << "Writing " << outCount << " elements\n"; + int zeroCount = 0; + for (size_t i = 0; i < outCount; i++) { + if (hBuf[i] == int(0)) { + zeroCount++; + } + outFile << hBuf[i] << " "; + if ((i + 1) % (total_output_len) == 0) { + outFile << std::endl; + } + + if (i < 10) { + printf("%5d ", hBuf[i]); + } + if ((i + 1) % (total_output_len) == 0 && i < 10) { + std::cout << std::endl; + } + } + std::cout << std::endl << "zeroCount = " << zeroCount << 
std::endl; + } + delete[] hBuf; + } + } + + // test time + struct timeval start, end; + mpi::barrier(); + cudaDeviceSynchronize(); + gettimeofday(&start, NULL); + + ft_nvtx::setScope("total_time"); + PUSH_RANGE("total time") + for (int i = 0; i < ite; ++i) { + llama.forward(&output_tensors, &input_tensors, &llama_weights); + } + + cudaDeviceSynchronize(); + mpi::barrier(); + + POP_RANGE; + ft_nvtx::resetScope(); + gettimeofday(&end, NULL); + + cudaProfilerStop(); + + printf("[INFO] request_batch_size %ld beam_width %ld head_num %ld size_per_head %ld total_output_len %d" + " decoder_layers %ld vocab_size %ld FT-CPP-decoding-beamsearch-time %.2f ms\n", + request_batch_size, + beam_width, + head_num, + size_per_head, + total_output_len, + decoder_layers, + vocab_size, + ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite); + + ftNcclParamDestroy(tensor_para); + ftNcclParamDestroy(pipeline_para); + + delete cublas_algo_map; + delete cublas_wrapper_mutex; + + cudaFree(d_bad_words); + cudaFree(d_stop_words); + if (d_input_ids != nullptr) { + cudaFree(d_input_ids); + } + if (d_input_lengths != nullptr) { + cudaFree(d_input_lengths); + } + if (d_output_ids != nullptr) { + deviceFree(d_output_ids); + } + if (d_sequence_lengths != nullptr) { + deviceFree(d_sequence_lengths); + } + + return; +} diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv new file mode 100644 index 000000000..88e742f39 --- /dev/null +++ b/examples/cpp/llama/start_ids.csv @@ -0,0 +1,8 @@ +688, 253, 1390, 4564, 273, 1897, 13, 247 +510, 1457, 8911, 4487, 273, 26593, 310, 6600 +510, 1457, 2816, 28260, 452, 247, 747, 1481 +510, 1457, 2816, 7717, 556, 3863, 697, 7970 +688, 247, 2118, 326, 588, 2779, 1056, 352 +510, 1457, 2816, 28260, 8, 13413, 19169, 14745 +510, 9462, 5687, 556, 38350, 26212, 253, 747 +510, 806, 673, 309, 3047, 253, 6440, 13 \ No newline at end of file diff --git a/examples/cpp/llama/stop_words.csv b/examples/cpp/llama/stop_words.csv new file mode 100644 index 000000000..9b9b09eba --- /dev/null +++ b/examples/cpp/llama/stop_words.csv @@ -0,0 +1,2 @@ +287, 4346, 12 +3, -1, -1 From 2cb06f11fea98e0bb8dd61ee889870c84b8d966d Mon Sep 17 00:00:00 2001 From: dypshong Date: Wed, 13 Sep 2023 13:51:34 +0000 Subject: [PATCH 06/55] llama...... 
--- .gitignore | 5 +- examples/cpp/llama/llama_config.ini | 4 +- examples/cpp/llama/llama_example.cc | 7 +- src/fastertransformer/models/llama/LLaMA.cc | 77 +++------- src/fastertransformer/models/llama/LLaMA.h | 5 - .../models/llama/LLaMAContextDecoder.cc | 140 ++++++----------- .../models/llama/LLaMAContextDecoder.h | 4 - .../models/llama/LLaMADecoder.cc | 142 ++++++------------ .../models/llama/LLaMADecoder.h | 4 - .../models/llama/LLaMADecoderLayerWeight.cc | 39 ++--- .../models/llama/LLaMADecoderLayerWeight.h | 4 +- .../models/llama/LLaMAWeight.cc | 16 +- .../models/llama/LLaMAWeight.h | 15 +- 13 files changed, 137 insertions(+), 325 deletions(-) diff --git a/.gitignore b/.gitignore index 77849f435..5b49d9183 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,7 @@ __pycache__/ **/.ipynb_checkpoints/ /3rdparty/NeMo/ -/3rdparty/apex/ \ No newline at end of file +/3rdparty/apex/ +20B_checkpoints/ +compile_commands.json +model/ diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 58874bdc2..68f4663d1 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -1,6 +1,6 @@ [ft_instance_hyperparameter] model_name=llama_33B -model_dir=../models/llam +model_dir=../models/llama data_type=fp16 pipeline_para_size=4 @@ -19,5 +19,3 @@ rotary_embedding=128 multiple_of=256 start_id=0 end_id=2 - -use_gptj_residual=1 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 699e39154..62919d57a 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -205,18 +205,14 @@ void llama_example(const INIReader reader) cublas_wrapper.setFP32GemmConfig(); } - // LLAMA Residual Type - const bool use_gptj_residual = (bool)reader.GetInteger(model_name, "use_gptj_residual", 1); fastertransformer::LLaMAWeight llama_weights(hidden_units, inter_size, vocab_size, decoder_layers, - 0, // max_seq_len, deprecated tensor_para.world_size_, tensor_para.rank_, pipeline_para.world_size_, - pipeline_para.rank_, - use_gptj_residual); + pipeline_para.rank_); model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; llama_weights.loadModel(model_dir); @@ -244,7 +240,6 @@ void llama_example(const INIReader reader) rotary_embedding_dim, start_id, end_id, - use_gptj_residual, random_seed, tensor_para, pipeline_para, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 575636fb4..3734b63d5 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -16,8 +16,8 @@ #include "src/fastertransformer/models/llama/LLaMA.h" #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" -#include "src/fastertransformer/kernels/decoding_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" +#include "src/fastertransformer/kernels/decoding_kernels.h" #include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" #include @@ -32,9 +32,7 @@ void LLaMA::initialize() num_layer_, rotary_embedding_dim_, neox_rotary_style_, - use_gptj_residual_, layernorm_eps_, - tensor_para_, pipeline_para_, stream_, cublas_wrapper_, @@ -51,9 +49,7 @@ void LLaMA::initialize() num_layer_, rotary_embedding_dim_, neox_rotary_style_, - use_gptj_residual_, layernorm_eps_, - tensor_para_, pipeline_para_, stream_, cublas_wrapper_, @@ -96,20 +92,16 @@ void LLaMA::allocateBuffer( (T*)(allocator_->reMalloc(padded_embedding_bias_, sizeof(T) * vocab_size_padded_, true)); } 
- input_attention_mask_ = (T*)(allocator_->reMalloc( - input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); - decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - decoder_output_buf_ = - (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - normed_decoder_output_buf_ = - (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); - nccl_logits_buf_ = - (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); - cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); - finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); - h_finished_buf_ = new bool[batchxbeam]; - sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); + input_attention_mask_ = (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); + decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + decoder_output_buf_ = (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + nccl_logits_buf_ = (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); + finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); + h_finished_buf_ = new bool[batchxbeam]; + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); value_cache_ = key_cache_ + self_cache_size; @@ -214,7 +206,6 @@ LLaMA::LLaMA(size_t head_num, size_t rotary_embedding_dim, int start_id, int end_id, - bool use_gptj_residual, unsigned long long random_seed, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -233,7 +224,6 @@ LLaMA::LLaMA(size_t head_num, rotary_embedding_dim_(rotary_embedding_dim), start_id_(start_id), end_id_(end_id), - use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), local_head_num_(head_num / 1), attention_type_(attention_type) @@ -260,7 +250,6 @@ LLaMA::LLaMA(size_t head_num, size_t rotary_embedding_dim, int start_id, int end_id, - bool use_gptj_residual, unsigned long long random_seed, NcclParam tensor_para, NcclParam pipeline_para, @@ -281,7 +270,6 @@ LLaMA::LLaMA(size_t head_num, rotary_embedding_dim_(rotary_embedding_dim), start_id_(start_id), end_id_(end_id), - use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), tensor_para_(tensor_para), pipeline_para_(pipeline_para), @@ -310,7 +298,6 @@ LLaMA::LLaMA(LLaMA const& llama): start_id_(llama.start_id_), end_id_(llama.end_id_), prompt_learning_start_id_(llama.prompt_learning_start_id_), - use_gptj_residual_(llama.use_gptj_residual_), 
hidden_units_(llama.hidden_units_), tensor_para_(llama.tensor_para_), pipeline_para_(llama.pipeline_para_), @@ -403,27 +390,19 @@ void LLaMA::forward(std::unordered_map* output_ten // Padding works as follows: p p x x i i i x x --> p p i i i x x x x (p denotes prompt, i denotes input, x denotes // pad) // TODO (perkzz): move unnecessary paddings - has_prefix_prompt_ = false; int max_prefix_prompt_length = 0; // NOTE: Prefix Prompt PreProcessing // get prefix_prompt_weight for each batch --> shape [batch, beam_width] // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] - std::vector prefix_prompt_weight_batch_ptrs; - std::vector prefix_prompt_lengths; - int max_input_length = input_tensors->at("input_ids").shape[1]; - FT_CHECK_WITH_INFO(!(max_input_length == 0 && max_prefix_prompt_length > 0), - "Prefix Prompt should come with inputs!"); // Prefix Soft Prompt - has_prefix_soft_prompt_ = false; - const size_t max_prefix_soft_prompt_length = 0; - const size_t limit_len_offset = 0 + (max_input_length == 0 ? 1 : 0); + const size_t limit_len_offset = (max_input_length == 0 ? 1 : 0); const size_t max_output_seq_len = input_tensors->at("output_seq_len").max() + limit_len_offset; const size_t max_seq_len = max_output_seq_len; // max cache seq len should include max prefix prompt length as it has k/v states - const size_t max_cache_seq_len = max_output_seq_len + max_prefix_prompt_length; + const size_t max_cache_seq_len = max_output_seq_len; if (max_cache_seq_len < max_seq_len) { FT_LOG_WARNING("max_cache_seq_len (%d) is less than max_seq_len (%d). " "Note that this reduces the memory cost of k/v cache, but may hurt the accuracy.", @@ -437,8 +416,7 @@ void LLaMA::forward(std::unordered_map* output_ten max_seq_len); } const cudaDataType_t gemm_data_type = getCudaDataType(); - allocateBuffer( - batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length + 0); + allocateBuffer(batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length); setSeqLimitLen(seq_limit_len_, input_tensors->at("output_seq_len"), limit_len_offset, batch_size); sync_check_cuda_error(); @@ -472,8 +450,6 @@ void LLaMA::forward(std::unordered_map* output_ten cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); } - // Prefix prompts - sync_check_cuda_error(); // handle first step @@ -507,7 +483,7 @@ void LLaMA::forward(std::unordered_map* output_ten tiled_prompt_lengths_buf_, batch_size * beam_width, max_input_length, - max_prefix_prompt_length, + 0, stream_); sync_check_cuda_error(); @@ -523,19 +499,19 @@ void LLaMA::forward(std::unordered_map* output_ten {batch_size * beam_width, 1, (size_t)max_input_length, - (size_t)(max_input_length + max_prefix_prompt_length)}, + (size_t)(max_input_length)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_}}, {"d_prefix_prompt_batch", Tensor{MEMORY_GPU, data_type, {batch_size * beam_width}, - has_prefix_prompt_ ? prompt_learning_weight_batch_ : nullptr}}, + nullptr}}, {"d_prefix_prompt_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, - has_prefix_prompt_ ? 
tiled_prompt_lengths_buf_ : nullptr}}}; + nullptr}}}; std::unordered_map decoder_output_tensors{ {"decoder_output", @@ -561,8 +537,7 @@ void LLaMA::forward(std::unordered_map* output_ten max_input_length - 1, stream_); sync_check_cuda_error(); - } - else if (max_input_length == 0) { + } else if (max_input_length == 0) { max_input_length++; invokeDecodingInitialize(finished_buf_, sequence_lengths_, @@ -580,8 +555,7 @@ void LLaMA::forward(std::unordered_map* output_ten cudaMemcpyHostToDevice, stream_); sync_check_cuda_error(); - } - else if (max_input_length == 1) { + } else if (max_input_length == 1) { invokeDecodingInitialize(finished_buf_, sequence_lengths_, nullptr, @@ -630,7 +604,7 @@ void LLaMA::forward(std::unordered_map* output_ten input_tensors->at("input_lengths").getPtr(), // not_tiled tiled_prompt_lengths_buf_, max_cache_seq_len, - max_input_length + max_prefix_prompt_length, + max_input_length, 0, batch_size, beam_width, @@ -685,7 +659,7 @@ void LLaMA::forward(std::unordered_map* output_ten Tensor{MEMORY_GPU, TYPE_INT32, {local_batch_size}, - has_prefix_prompt_ ? (tiled_prompt_lengths_buf_ + id_offset) : nullptr}}, + nullptr}}, {"max_prefix_prompt_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_prefix_prompt_length}}, {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, @@ -902,9 +876,9 @@ void LLaMA::forward(std::unordered_map* output_ten */ invokeUpdatePaddingCount(tiled_total_padding_count_, input_tensors->at("input_lengths").getPtr(), // not_tiled - has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : (const int*)nullptr, + (const int*)nullptr, max_input_length, - has_prefix_prompt_ ? max_prefix_prompt_length : 0, + 0, batch_size, beam_width, stream_); @@ -1015,8 +989,7 @@ void LLaMA::setOutputTensors(std::unordered_map* o param.parent_ids = beam_width == 1 ? nullptr : parent_ids_buf_; param.end_tokens = end_ids_buf_; param.max_input_length = max_input_length; - param.prefix_soft_prompt_lengths = - has_prefix_soft_prompt_ ? 
input_tensors->at("request_prompt_lengths").getPtr() : nullptr; + param.prefix_soft_prompt_lengths = nullptr; param.input_lengths = tiled_input_lengths_buf_; param.max_prefix_soft_prompt_length = 0; param.max_input_without_prompt_length = max_input_length; diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 48506f529..2f4f52c7b 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -60,9 +60,6 @@ class LLaMA: public BaseLayer { (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); - // Residual Type - const bool use_gptj_residual_ = true; - // Prompt Learning Parameters PromptLearningType prompt_learning_type_; int prompt_learning_start_id_; // start_id for prompt_learning (only needed by prefix prompts) @@ -145,7 +142,6 @@ class LLaMA: public BaseLayer { size_t rotary_embedding_dim, int start_id, int end_id, - bool use_gptj_residual, unsigned long long random_seed, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -164,7 +160,6 @@ class LLaMA: public BaseLayer { size_t rotary_embedding_dim, int start_id, int end_id, - bool use_gptj_residual, unsigned long long random_seed, NcclParam tensor_para, NcclParam pipeline_para, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 69ed839a3..ecf127ae6 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -18,49 +18,44 @@ #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" -#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" -#include "src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.h" +#include "src/fastertransformer/layers/FfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" + namespace fastertransformer { template void LLaMAContextDecoder::initialize() { - self_attention_layer_ = new TensorParallelGptContextAttentionLayer(0, // max_batch_size - 0, // max_seq_len - head_num_, - size_per_head_, - rotary_embedding_dim_, - neox_rotary_style_, - tensor_para_, - stream_, - cublas_wrapper_, - allocator_, - !use_gptj_residual_, - is_free_buffer_after_forward_, - is_qk_buf_float_, - false, - 0, - custom_all_reduce_comm_, - enable_custom_all_reduce_); - - ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size - 0, // max_seq_len - head_num_, - size_per_head_, - 0, // expert_num - inter_size_, - tensor_para_, - stream_, - cublas_wrapper_, - allocator_, - !use_gptj_residual_, - is_free_buffer_after_forward_, - false, - 0, - false, // use_gated_activation = false; - custom_all_reduce_comm_, - enable_custom_all_reduce_); + self_attention_layer_ = new GptContextAttentionLayer(0, // max_batch_size + 0, // max_seq_len + head_num_, + size_per_head_, + head_num_, + rotary_embedding_dim_, + neox_rotary_style_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + is_qk_buf_float_, + false, + 0); + + ffn_layer_ = new GeluFfnLayer(0, // max_batch_size + 1, + head_num_, + size_per_head_, + 0, // expert_num + inter_size_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + false, + 0, + false // use_gated_activation = false + ); } template @@ -138,9 +133,7 @@ 
LLaMAContextDecoder::LLaMAContextDecoder(size_t size_t num_layer, size_t rotary_embedding_dim, bool neox_rotary_style, - bool use_gptj_residual, float layernorm_eps, - NcclParam tensor_para, NcclParam pipeline_para, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -157,10 +150,8 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t num_layer_(num_layer), rotary_embedding_dim_(rotary_embedding_dim), neox_rotary_style_(neox_rotary_style), - use_gptj_residual_(use_gptj_residual), layernorm_eps_(layernorm_eps), hidden_units_(head_num * size_per_head), - tensor_para_(tensor_para), pipeline_para_(pipeline_para), is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type), @@ -179,10 +170,8 @@ LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decode num_layer_(decoder.num_layer_), rotary_embedding_dim_(decoder.rotary_embedding_dim_), neox_rotary_style_(decoder.neox_rotary_style_), - use_gptj_residual_(decoder.use_gptj_residual_), layernorm_eps_(decoder.layernorm_eps_), hidden_units_(decoder.hidden_units_), - tensor_para_(decoder.tensor_para_), pipeline_para_(decoder.pipeline_para_), is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_), @@ -319,15 +308,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* } if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_ / tensor_para_.world_size_; - ftNcclRecv(layer_input + data_size * tensor_para_.rank_, + int data_size = h_token_num * hidden_units_; + ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - if (tensor_para_.world_size_ > 1) { - ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); - } } invokeGeneralLayerNorm(decoder_normed_input_, @@ -396,20 +382,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* &llama_decoder_layer_weight->at(l)->self_attention_weights); if (is_final == false) { - if (use_gptj_residual_) { - invokeGeneralLayerNorm(decoder_normed_input_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - layernorm_eps_, - h_token_num, - hidden_units_, - (float*)nullptr, - 0, - stream_); - } - else { - invokeGeneralAddBiasResidualPreLayerNorm( + invokeGeneralAddBiasResidualPreLayerNorm( self_attn_output_, decoder_normed_input_, self_attn_output_, @@ -426,7 +399,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* (float*)nullptr, 0, stream_); - } TensorMap ffn_input_tensors( {{"ffn_input", @@ -435,47 +407,23 @@ void LLaMAContextDecoder::forward(std::unordered_map* Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, - use_gptj_residual_ ? 
ffn_output_ : layer_output}}}); + layer_output}}}); ffn_layer_->forward( &ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); - if (use_gptj_residual_) { - // Original workflow: - // layer_output = layer_input + reduceSum(ffn_output + self_attn_output + ffn_output_bias) - // Our workflow: - // layer_output = reduceSum(ffn_output + self_attn_output + ffn_output_bias + layer_input / - // TP_size) - // They are equivalent on math, but we can use same buffer for layer_input and layer_output - - invokeAddBiasAttentionFfnResidual(layer_output, - ffn_output_, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - h_token_num, - hidden_units_, - tensor_para_.world_size_, - stream_); - if (tensor_para_.world_size_ > 1) { - ftNcclAllReduceSum( - layer_output, layer_output, h_token_num * hidden_units_, tensor_para_, stream_); - } - } - else { - invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - h_token_num, - hidden_units_, - stream_); - } + invokeAddBiasResidual(layer_output, + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + stream_); sync_check_cuda_error(); if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_ / tensor_para_.world_size_; - ftNcclSend(layer_output + data_size * tensor_para_.rank_, + int data_size = h_token_num * hidden_units_; + ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index b84285f14..c9c474e49 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -42,13 +42,11 @@ class LLaMAContextDecoder: public BaseLayer { size_t num_layer_; size_t rotary_embedding_dim_; bool neox_rotary_style_; - bool use_gptj_residual_; float layernorm_eps_; // calculated data size_t hidden_units_; - NcclParam tensor_para_; NcclParam pipeline_para_; std::shared_ptr custom_all_reduce_comm_; @@ -88,9 +86,7 @@ class LLaMAContextDecoder: public BaseLayer { size_t num_layer, size_t rotary_embedding_dim, bool neox_rotary_style, - bool use_gptj_residual, float layernorm_eps, - NcclParam tensor_para, NcclParam pipeline_para, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, diff --git a/src/fastertransformer/models/llama/LLaMADecoder.cc b/src/fastertransformer/models/llama/LLaMADecoder.cc index 3a8fc1458..051744693 100644 --- a/src/fastertransformer/models/llama/LLaMADecoder.cc +++ b/src/fastertransformer/models/llama/LLaMADecoder.cc @@ -15,47 +15,43 @@ */ #include "src/fastertransformer/models/llama/LLaMADecoder.h" -#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" -#include "src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.h" +#include "src/fastertransformer/layers/FfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h" namespace fastertransformer { template void LLaMADecoder::initialize() { - self_attention_layer_ = new TensorParallelDecoderSelfAttentionLayer(0, // max_batch_size - head_num_, - size_per_head_, - rotary_embedding_dim_, - neox_rotary_style_, - tensor_para_, - stream_, - cublas_wrapper_, - allocator_, - 
!use_gptj_residual_, - is_free_buffer_after_forward_, - false, - 0, - custom_all_reduce_comm_, - enable_custom_all_reduce_); - - ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size - 1, - head_num_, - size_per_head_, - 0, // expert_num - inter_size_, - tensor_para_, - stream_, - cublas_wrapper_, - allocator_, - !use_gptj_residual_, - is_free_buffer_after_forward_, - false, - 0, - false, // use_gated_activation = false; - custom_all_reduce_comm_, - enable_custom_all_reduce_); + self_attention_layer_ = new DecoderSelfAttentionLayer(0, // max_batch_size + head_num_, + size_per_head_, + head_num_, + rotary_embedding_dim_, + neox_rotary_style_, + head_num_ * size_per_head_, + 1.0f, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + false, + 0); + + ffn_layer_ = new GeluFfnLayer(0, // max_batch_size + 1, + head_num_, + size_per_head_, + 0, // expert_num + inter_size_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + false, + 0, + false // use_gated_activation = false + ); } template @@ -126,9 +122,7 @@ LLaMADecoder::LLaMADecoder(size_t head_num, size_t num_layer, size_t rotary_embedding_dim, bool neox_rotary_style, - bool use_gptj_residual, float layernorm_eps, - NcclParam tensor_para, NcclParam pipeline_para, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -143,10 +137,8 @@ LLaMADecoder::LLaMADecoder(size_t head_num, num_layer_(num_layer), rotary_embedding_dim_(rotary_embedding_dim), neox_rotary_style_(neox_rotary_style), - use_gptj_residual_(use_gptj_residual), layernorm_eps_(layernorm_eps), hidden_units_(head_num_ * size_per_head), - tensor_para_(tensor_para), pipeline_para_(pipeline_para), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) @@ -163,10 +155,8 @@ LLaMADecoder::LLaMADecoder(LLaMADecoder const& decoder): num_layer_(decoder.num_layer_), rotary_embedding_dim_(decoder.rotary_embedding_dim_), neox_rotary_style_(decoder.neox_rotary_style_), - use_gptj_residual_(decoder.use_gptj_residual_), layernorm_eps_(decoder.layernorm_eps_), hidden_units_(decoder.hidden_units_), - tensor_para_(decoder.tensor_para_), pipeline_para_(decoder.pipeline_para_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) @@ -247,18 +237,15 @@ void LLaMADecoder::forward(std::unordered_map* T* layer_output = (l == num_layer_ - 1) ? 
decoder_output : decoder_layer_output_; if (isFirstLayerParallelId(l) == true && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { - int data_size = local_batch_size * hidden_units_ / tensor_para_.world_size_; + int data_size = local_batch_size * hidden_units_; // ftNcclRecv(layer_input, local_batch_size * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, // stream_); - ftNcclRecv(layer_input + data_size * tensor_para_.rank_, + ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - if (tensor_para_.world_size_ > 1) { - ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); - } } invokeGeneralLayerNorm(decoder_normed_input_, @@ -293,22 +280,10 @@ void LLaMADecoder::forward(std::unordered_map* {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; self_attention_layer_->forward(&self_attention_output_tensors, - &self_attention_input_tensors, - &llama_decoder_layer_weight->at(l)->self_attention_weights); - if (use_gptj_residual_) { - invokeGeneralLayerNorm(decoder_normed_input_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - layernorm_eps_, - local_batch_size, - hidden_units_, - (float*)nullptr, - 0, - stream_); - } - else { - invokeGeneralAddBiasResidualPreLayerNorm( + &self_attention_input_tensors, + &llama_decoder_layer_weight->at(l)->self_attention_weights); + + invokeGeneralAddBiasResidualPreLayerNorm( self_attn_output_, decoder_normed_input_, self_attn_output_, @@ -325,7 +300,6 @@ void LLaMADecoder::forward(std::unordered_map* (float*)nullptr, 0, stream_); - } TensorMap ffn_input_tensors( {{"ffn_input", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}}}); @@ -333,46 +307,22 @@ void LLaMADecoder::forward(std::unordered_map* Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, - use_gptj_residual_ ? 
ffn_output_ : layer_output}}}); + layer_output}}}); ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); - if (use_gptj_residual_) { - // Original workflow: - // layer_output = layer_input + reduceSum(ffn_output + self_attn_output + ffn_output_bias) - // Our workflow: - // layer_output = reduceSum(ffn_output + self_attn_output + ffn_output_bias + layer_input / TP_size) - // They are equivalent on math, but we can use same buffer for layer_input and layer_output - invokeAddBiasAttentionFfnResidual(layer_output, - ffn_output_, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - local_batch_size, - hidden_units_, - tensor_para_.world_size_, - stream_); - if (tensor_para_.world_size_ > 1) { - ftNcclAllReduceSum(layer_output, layer_output, local_batch_size * hidden_units_, tensor_para_, stream_); - } - } - else { - invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - local_batch_size, - hidden_units_, - stream_); - } + invokeAddBiasResidual(layer_output, + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + local_batch_size, + hidden_units_, + stream_); sync_check_cuda_error(); if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { - int data_size = local_batch_size * hidden_units_ / tensor_para_.world_size_; - // ftNcclSend(layer_output, local_batch_size * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, - // stream_); - - ftNcclSend(layer_output + data_size * tensor_para_.rank_, + int data_size = local_batch_size * hidden_units_; + ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, diff --git a/src/fastertransformer/models/llama/LLaMADecoder.h b/src/fastertransformer/models/llama/LLaMADecoder.h index cbbc272ff..773637d65 100644 --- a/src/fastertransformer/models/llama/LLaMADecoder.h +++ b/src/fastertransformer/models/llama/LLaMADecoder.h @@ -52,11 +52,9 @@ class LLaMADecoder: public BaseLayer { size_t num_layer_; size_t rotary_embedding_dim_; bool neox_rotary_style_; - bool use_gptj_residual_; size_t hidden_units_; float layernorm_eps_; - NcclParam tensor_para_; NcclParam pipeline_para_; std::shared_ptr custom_all_reduce_comm_; @@ -77,9 +75,7 @@ class LLaMADecoder: public BaseLayer { size_t num_layer, size_t rotary_embedding_dim, bool neox_rotary_style, - bool use_gptj_residual, float layernorm_eps, - NcclParam tensor_para, NcclParam pipeline_para, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc index 9ed355047..412a1d076 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc @@ -23,13 +23,11 @@ template LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size, - const int tensor_para_rank, - const bool use_gptj_residual): + const int tensor_para_rank): hidden_units_(hidden_units), inter_size_(inter_size), tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank), - use_gptj_residual_(use_gptj_residual) + tensor_para_rank_(tensor_para_rank) { mallocWeights(); setWeightPtr(); @@ -40,7 +38,7 @@ LLaMADecoderLayerWeight::~LLaMADecoderLayerWeight() { if 
(is_maintain_buffer == true) { for (int i = 0; i < 12; i++) { - if (!use_gptj_residual_ && i != attention_dense_bias_weight_id) { + if (i != attention_dense_bias_weight_id) { cudaFree(weights_ptr[i]); } } @@ -67,8 +65,7 @@ LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const LLaMADecoderLayerWeigh hidden_units_(other.hidden_units_), inter_size_(other.inter_size_), tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_), - use_gptj_residual_(other.use_gptj_residual_) + tensor_para_rank_(other.tensor_para_rank_) { mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); @@ -76,9 +73,7 @@ LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const LLaMADecoderLayerWeigh cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - if (!use_gptj_residual_) { - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - } + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); @@ -96,7 +91,6 @@ LLaMADecoderLayerWeight& LLaMADecoderLayerWeight::operator=(const LLaMADec inter_size_ = other.inter_size_; tensor_para_size_ = other.tensor_para_size_; tensor_para_rank_ = other.tensor_para_rank_; - use_gptj_residual_ = other.use_gptj_residual_; mallocWeights(); @@ -105,9 +99,7 @@ LLaMADecoderLayerWeight& LLaMADecoderLayerWeight::operator=(const LLaMADec cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - if (!use_gptj_residual_) { - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - } + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); @@ -143,10 +135,7 @@ void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType dir_path + ".attention.dense.weight." + rank_spec + ".bin", model_file_type); - if (!use_gptj_residual_) { - loadWeightFromBin( - weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.dense.bias.bin", model_file_type); - } + loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.dense.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[6], {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, @@ -160,14 +149,8 @@ void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.weight." 
+ rank_spec + ".bin", model_file_type); - if (use_gptj_residual_) { - loadWeightFromBin( - weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".mlp.attention.bias.sum.bin", model_file_type); - } - else { - loadWeightFromBin( + loadWeightFromBin( weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.bias.bin", model_file_type); - } loadWeightFromBin( weights_ptr[10], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.bias.bin", model_file_type); loadWeightFromBin( @@ -182,7 +165,7 @@ void LLaMADecoderLayerWeight::setWeightPtr() self_attention_weights.query_weight.kernel = weights_ptr[2]; self_attention_weights.query_weight.bias = weights_ptr[3]; self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; - self_attention_weights.attention_output_weight.bias = use_gptj_residual_ ? nullptr : weights_ptr[5]; + self_attention_weights.attention_output_weight.bias = weights_ptr[5]; ffn_weights.intermediate_weight.kernel = weights_ptr[6]; ffn_weights.intermediate_weight.bias = weights_ptr[7]; @@ -202,9 +185,7 @@ void LLaMADecoderLayerWeight::mallocWeights() deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - if (!use_gptj_residual_) { - deviceMalloc(&weights_ptr[5], hidden_units_); - } + deviceMalloc(&weights_ptr[5], hidden_units_); deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h index 44726f58c..4a6fc6a22 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h @@ -32,8 +32,7 @@ struct LLaMADecoderLayerWeight { LLaMADecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, - const int tensor_para_rank = 0, - const bool use_gptj_residual = true); + const int tensor_para_rank = 0); ~LLaMADecoderLayerWeight(); LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other); LLaMADecoderLayerWeight& operator=(const LLaMADecoderLayerWeight& other); @@ -50,7 +49,6 @@ struct LLaMADecoderLayerWeight { int inter_size_; int tensor_para_size_; int tensor_para_rank_; - bool use_gptj_residual_; const int attention_dense_bias_weight_id = 5; bool is_maintain_buffer = false; T* weights_ptr[12]; diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc index dddf6eff6..f0bdc282f 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.cc +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -23,31 +23,26 @@ LLaMAWeight::LLaMAWeight(const int hidden_un const int inter_size, const int vocab_size, const int num_layer, - const int max_seq_len, const int tensor_para_size, const int tensor_para_rank, const int layer_para_size, - const int layer_para_rank, - const bool use_gptj_residual): + const int layer_para_rank): hidden_units_(hidden_units), inter_size_(inter_size), vocab_size_(vocab_size), num_layer_(num_layer), - max_seq_len_(max_seq_len), tensor_para_size_(tensor_para_size), tensor_para_rank_(tensor_para_rank), layer_para_size_(layer_para_size), - layer_para_rank_(layer_para_rank), - use_gptj_residual_(use_gptj_residual) + layer_para_rank_(layer_para_rank) { 
FT_CHECK(num_layer_ % layer_para_size_ == 0); - decoder_layer_weights.reserve(num_layer_); for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { decoder_layer_weights.push_back(new LLaMADecoderLayerWeight( - hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); + hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_)); } else { // Layer-parallelism: allocate empty layer because @@ -82,12 +77,10 @@ LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): inter_size_(other.inter_size_), vocab_size_(other.vocab_size_), num_layer_(other.num_layer_), - max_seq_len_(other.max_seq_len_), tensor_para_size_(other.tensor_para_size_), tensor_para_rank_(other.tensor_para_rank_), layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), - use_gptj_residual_(other.use_gptj_residual_), prompt_token_weight_size_(other.prompt_token_weight_size_) { mallocWeights(); @@ -113,12 +106,10 @@ LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) inter_size_ = other.inter_size_; vocab_size_ = other.vocab_size_; num_layer_ = other.num_layer_; - max_seq_len_ = other.max_seq_len_; tensor_para_size_ = other.tensor_para_size_; tensor_para_rank_ = other.tensor_para_rank_; layer_para_size_ = other.layer_para_size_; layer_para_rank_ = other.layer_para_rank_; - use_gptj_residual_ = other.use_gptj_residual_; prompt_token_weight_size_ = other.prompt_token_weight_size_; mallocWeights(); @@ -169,6 +160,7 @@ void LLaMAWeight::loadModel(std::string dir_path) weights_ptr[0], {(size_t)(vocab_size_ * hidden_units_)}, dir_path + "/model.wte.bin", model_file_type); loadWeightFromBin( weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.bias.bin", model_file_type); + loadWeightFromBin( weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.weight.bin", model_file_type); loadWeightFromBin(weights_ptr[3], diff --git a/src/fastertransformer/models/llama/LLaMAWeight.h b/src/fastertransformer/models/llama/LLaMAWeight.h index 5f3c071e6..b372139e2 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.h +++ b/src/fastertransformer/models/llama/LLaMAWeight.h @@ -32,12 +32,10 @@ struct LLaMAWeight { const int inter_size, const int vocab_size, const int num_layer, - const int max_seq_len, const int tensor_para_size = 1, const int tensor_para_rank = 0, const int layer_para_size = 1, - const int layer_para_rank = 0, - const bool use_gptj_residual_ = true); + const int layer_para_rank = 0); ~LLaMAWeight(); LLaMAWeight(const LLaMAWeight& other); @@ -49,18 +47,11 @@ struct LLaMAWeight { std::vector*> decoder_layer_weights; const T* pre_decoder_embedding_table = nullptr; - // GPT-J does not use embedding table, but we leave the ptr such that - // LLaMA::forward and Gpt::forward become identical const T* position_encoding_table = nullptr; LayerNormWeight post_decoder_layernorm; DenseWeight post_decoder_embedding; - inline void setMaxSeqLen(size_t max_seq_len) - { - max_seq_len_ = max_seq_len; - } - private: void setWeightPtr(); void mallocWeights(); @@ -70,16 +61,12 @@ struct LLaMAWeight { int inter_size_; int vocab_size_; int num_layer_; - int max_seq_len_; int tensor_para_size_; int tensor_para_rank_; int layer_para_size_; int layer_para_rank_; - // residual type - bool use_gptj_residual_; - // prompt learning pair (task_name, (task_name_id, prompt_len)) // each prompt token's weight size size_t prompt_token_weight_size_ = 0; From ca0a25a4dabad2b0aa935aa15b342ecc13de05ca Mon Sep 17 00:00:00 2001 From: dypshong 
Date: Wed, 13 Sep 2023 13:59:03 +0000 Subject: [PATCH 07/55] remove gpt dependency --- examples/cpp/llama/CMakeLists.txt | 6 +- examples/cpp/llama/llama_example.cc | 4 +- examples/cpp/llama/llama_example_utils.cc | 95 +++++++++++++++++++++++ examples/cpp/llama/llama_example_utils.h | 31 ++++++++ 4 files changed, 132 insertions(+), 4 deletions(-) create mode 100644 examples/cpp/llama/llama_example_utils.cc create mode 100644 examples/cpp/llama/llama_example_utils.h diff --git a/examples/cpp/llama/CMakeLists.txt b/examples/cpp/llama/CMakeLists.txt index ce0bee75f..19fb6e7fc 100644 --- a/examples/cpp/llama/CMakeLists.txt +++ b/examples/cpp/llama/CMakeLists.txt @@ -12,7 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +add_library(llama_example_utils STATIC llama_example_utils.cc) +target_link_libraries(llama_example_utils PUBLIC -lcublas -lcublasLt -lcudart + nvtx_utils mpi_utils nccl_utils) + add_executable(llama_example llama_example.cc) target_link_libraries(llama_example PUBLIC -lcublas -lcublasLt -lcudart LLaMA mpi_utils nccl_utils nvtx_utils - gpt_example_utils word_list) + llama_example_utils word_list) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 62919d57a..4d0d60a93 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -19,11 +19,9 @@ #include "src/fastertransformer/utils/nccl_utils.h" #include "src/fastertransformer/utils/nvtx_utils.h" #include "src/fastertransformer/utils/word_list.h" +#include "examples/cpp/llama/llama_example_utils.h" #include "3rdparty/INIReader.h" -// Remove LATER -#include "examples/cpp/multi_gpu_gpt/gpt_example_utils.h" - #include #include #include diff --git a/examples/cpp/llama/llama_example_utils.cc b/examples/cpp/llama/llama_example_utils.cc new file mode 100644 index 000000000..77f621dbf --- /dev/null +++ b/examples/cpp/llama/llama_example_utils.cc @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "examples/cpp/llama/llama_example_utils.h" + +#include +#include +#include +#include + +namespace fastertransformer { + +int read_start_ids(size_t batch_size, + std::vector* v_start_lengths, + std::vector* v_start_ids, + size_t& max_input_len, + const int end_id, + const int beam_width, + std::string file_name) +{ + std::vector> tmp_start_ids; + std::vector tmp_start_lengths; + + std::ifstream start_id_file(file_name, std::ios::in); + int line_num = 0; + if (start_id_file.is_open()) { + std::string line; + while (std::getline(start_id_file, line)) { + std::stringstream lineStream(line); + std::string vals; + int i1 = 0; + std::vector tmp_vec; + while (std::getline(lineStream, vals, ',')) { + tmp_vec.push_back(std::stoi(vals)); + i1++; + } + tmp_start_ids.push_back(tmp_vec); + tmp_start_lengths.push_back(i1); + line_num++; + } + if (batch_size == 0) { + batch_size = line_num; + } + } + else { + printf("[WARNING] Cannot open the file '%s'. \n", file_name.c_str()); + max_input_len = 0; + return 0; + } + + max_input_len = tmp_start_lengths.data()[0]; + for (uint i = 1; i < (uint)tmp_start_lengths.size(); i++) { + max_input_len = max_input_len > tmp_start_lengths.data()[i] ? max_input_len : tmp_start_lengths.data()[i]; + } + + while ((int)tmp_start_lengths.size() < batch_size) { + std::vector padding_ids; + for (int i = 0; i < max_input_len; i++) { + padding_ids.push_back(end_id); + } + tmp_start_ids.push_back(padding_ids); + tmp_start_lengths.push_back(max_input_len); + } + + // Add padding + for (int i = 0; i < (int)tmp_start_ids.size(); i++) { + for (int j = (int)tmp_start_ids[i].size(); j < max_input_len; j++) { + tmp_start_ids[i].push_back(end_id); + } + } + + for (int i = 0; i < (int)tmp_start_ids.size(); i++) { + for (int b = 0; b < beam_width; b++) { + for (int j = 0; j < (int)tmp_start_ids[i].size(); j++) { + v_start_ids->push_back(tmp_start_ids[i][j]); + } + v_start_lengths->push_back(tmp_start_lengths[i]); + } + } + return batch_size; +} + +} // namespace fastertransformer diff --git a/examples/cpp/llama/llama_example_utils.h b/examples/cpp/llama/llama_example_utils.h new file mode 100644 index 000000000..911cdf49a --- /dev/null +++ b/examples/cpp/llama/llama_example_utils.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace fastertransformer { + +int read_start_ids(size_t batch_size, + std::vector* v_start_lengths, + std::vector* v_start_ids, + size_t& max_input_len, + const int end_id, + const int beam_width, + std::string file_name); +} // namespace fastertransformer From 662d3b6cd2ccf7ed5c4222eea259bd1dd2278e7b Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 15 Sep 2023 08:46:19 +0000 Subject: [PATCH 08/55] fix loadModel to load llama & fix invokeGeneralLLaMALayerNorm to invoke RMSNorm --- examples/cpp/llama/llama_example.cc | 47 +- examples/cpp/llama/llama_example_utils.h | 1 + .../kernels/layernorm_kernels.cu | 94 ++++ .../kernels/layernorm_kernels.h | 13 +- src/fastertransformer/models/llama/LLaMA.cc | 408 ++++++------------ src/fastertransformer/models/llama/LLaMA.h | 100 ++--- .../models/llama/LLaMAContextDecoder.cc | 167 +++---- .../models/llama/LLaMAContextDecoder.h | 38 +- .../models/llama/LLaMADecoder.cc | 129 +++--- .../models/llama/LLaMADecoderLayerWeight.cc | 186 ++++---- .../models/llama/LLaMADecoderLayerWeight.h | 9 +- .../models/llama/LLaMAWeight.cc | 61 ++- .../models/llama/LLaMAWeight.h | 4 - 13 files changed, 579 insertions(+), 678 deletions(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 4d0d60a93..c1f4521bf 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -14,13 +14,13 @@ * limitations under the License. */ +#include "3rdparty/INIReader.h" +#include "examples/cpp/llama/llama_example_utils.h" #include "src/fastertransformer/models/llama/LLaMA.h" #include "src/fastertransformer/utils/mpi_utils.h" #include "src/fastertransformer/utils/nccl_utils.h" #include "src/fastertransformer/utils/nvtx_utils.h" #include "src/fastertransformer/utils/word_list.h" -#include "examples/cpp/llama/llama_example_utils.h" -#include "3rdparty/INIReader.h" #include #include @@ -71,9 +71,9 @@ int main(int argc, char* argv[]) template void llama_example(const INIReader reader) { - const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); - std::string model_dir = std::string(reader.Get("ft_instance_hyperparameter", "model_dir")); - int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); + const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); + std::string model_dir = std::string(reader.Get("ft_instance_hyperparameter", "model_dir")); + int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); const size_t head_num = reader.GetInteger(model_name, "head_num"); const size_t size_per_head = reader.GetInteger(model_name, "size_per_head"); @@ -85,7 +85,7 @@ void llama_example(const INIReader reader) const int end_id = reader.GetInteger(model_name, "end_id"); const size_t hidden_units = head_num * size_per_head; - const size_t inter_size = multiple_of * ((2 * hidden_units + multiple_of -1) / multiple_of); + const size_t inter_size = multiple_of * ((2 * hidden_units + multiple_of - 1) / multiple_of); const size_t beam_width = reader.GetInteger("request", "beam_width"); const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); @@ -207,8 +207,6 @@ void llama_example(const INIReader reader) inter_size, vocab_size, decoder_layers, - tensor_para.world_size_, - tensor_para.rank_, pipeline_para.world_size_, pipeline_para.rank_); @@ -231,22 +229,22 @@ void llama_example(const INIReader reader) true); // 
causal_mask LLaMA llama = LLaMA(head_num, - size_per_head, - inter_size, - decoder_layers, - vocab_size, - rotary_embedding_dim, - start_id, - end_id, - random_seed, - tensor_para, - pipeline_para, - stream, - &cublas_wrapper, - &allocator, - false, - &prop, - attention_type); + size_per_head, + inter_size, + decoder_layers, + vocab_size, + rotary_embedding_dim, + start_id, + end_id, + random_seed, + tensor_para, + pipeline_para, + stream, + &cublas_wrapper, + &allocator, + false, + &prop, + attention_type); int* d_output_ids; int* d_sequence_lengths; @@ -265,7 +263,6 @@ void llama_example(const INIReader reader) {"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, start_ids.data()}}, {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, end_ids.data()}}}; - input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}); std::unordered_map output_tensors = std::unordered_map{ diff --git a/examples/cpp/llama/llama_example_utils.h b/examples/cpp/llama/llama_example_utils.h index 911cdf49a..1e5d0b9ab 100644 --- a/examples/cpp/llama/llama_example_utils.h +++ b/examples/cpp/llama/llama_example_utils.h @@ -28,4 +28,5 @@ int read_start_ids(size_t batch_size, const int end_id, const int beam_width, std::string file_name); + } // namespace fastertransformer diff --git a/src/fastertransformer/kernels/layernorm_kernels.cu b/src/fastertransformer/kernels/layernorm_kernels.cu index 369030b37..b19e9ac73 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.cu +++ b/src/fastertransformer/kernels/layernorm_kernels.cu @@ -1859,6 +1859,100 @@ template void invokeGeneralT5LayerNorm(__nv_bfloat16* out, cudaStream_t stream); #endif +/******************* invokeGeneralLLaMALayerNorm ***********************/ + +template +__global__ void generalLLaMALayerNorm(const T* __restrict input, + const T* __restrict gamma, + const T* __restrict beta, + T* normed_output, + const float layernorm_eps, + int m, + int n) +{ + const int tid = threadIdx.x; + + extern __shared__ __align__(sizeof(float)) char _shmem[]; + T* shmem = reinterpret_cast(_shmem); + + __shared__ float s_mean_sq; + float mean_sq = 0.0f; + + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + float local_sum = 0.0f; + for (int i = tid; i < n; i += blockDim.x) { + float val = (float)(ldg(&input[blockIdx.x * n + i])); + local_sum += val * val; + } + + mean_sq = blockReduceSum(local_sum); + + if (threadIdx.x == 0) { + s_mean_sq = rsqrtf(mean_sq / (float)n + layernorm_eps); + } + __syncthreads(); + + for (int i = tid; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + float beta_val = (beta == nullptr) ? 0.0f : (float)ldg(&beta[i]); + T val = (T)(((float)input[index] * s_mean_sq) * (float)(ldg(&gamma[i])) + beta_val); + + normed_output[index] = val; + } +} + +template +void invokeGeneralLLaMALayerNorm(T* out, + const T* input, + const T* gamma, + const T* beta, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream) +{ + dim3 grid(m); + dim3 block(min(n, 1024)); + + /* For general cases, n is equal to hidden_units, e.g., 512/1024. + Since we have warp shuffle inside the code, block.x % 32 should be 0. 
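       Note: generalLLaMALayerNorm is an RMSNorm rather than a standard LayerNorm;
       it skips the mean subtraction and normalizes each row by its root mean
       square, i.e.

           y_i = x_i * rsqrt( (1/n) * sum_j x_j^2 + layernorm_eps ) * gamma_i + beta_i

       with beta_i treated as 0 when beta is nullptr. This matches the per-row
       mean-square block reduction and rsqrtf in the kernel above, and the commit's
       stated goal of making invokeGeneralLLaMALayerNorm compute RMSNorm for LLaMA.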
+ */ + if (n % 32 != 0) { + block.x = 1024; + } + + generalLLaMALayerNorm<<>>(input, gamma, beta, out, layernorm_eps, m, n); +} + +template void invokeGeneralLLaMALayerNorm(float* out, + const float* input, + const float* gamma, + const float* beta, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); +template void invokeGeneralLLaMALayerNorm(half* out, + const half* input, + const half* gamma, + const half* beta, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeGeneralLLaMALayerNorm(__nv_bfloat16* out, + const __nv_bfloat16* input, + const __nv_bfloat16* gamma, + const __nv_bfloat16* beta, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); +#endif + /******************* invokeLayernormShiftPartition ***********************/ // applied to half2 and bfloat162 diff --git a/src/fastertransformer/kernels/layernorm_kernels.h b/src/fastertransformer/kernels/layernorm_kernels.h index d8ac09234..5c5c03c7a 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.h +++ b/src/fastertransformer/kernels/layernorm_kernels.h @@ -24,7 +24,8 @@ namespace fastertransformer { -enum class LayerNormType { +enum class LayerNormType +{ pre_layernorm, post_layernorm, InvalidType @@ -161,6 +162,16 @@ void invokeGeneralT5LayerNorm(T* out, const int n, cudaStream_t stream); +template +void invokeGeneralLLaMALayerNorm(T* out, + const T* input, + const T* gamma, + const T* beta, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); + template void invokeGeneralAddResidualT5PreLayerNorm(T* output, T* norm_output, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 3734b63d5..9fcab580b 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -16,8 +16,8 @@ #include "src/fastertransformer/models/llama/LLaMA.h" #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" -#include "src/fastertransformer/kernels/gpt_kernels.h" #include "src/fastertransformer/kernels/decoding_kernels.h" +#include "src/fastertransformer/kernels/gpt_kernels.h" #include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" #include @@ -59,7 +59,7 @@ void LLaMA::initialize() enable_custom_all_reduce_); dynamic_decode_layer_ = new DynamicDecodeLayer(vocab_size_, - vocab_size_padded_, + vocab_size_, 0, // end_id, deprecated stream_, cublas_wrapper_, @@ -79,29 +79,24 @@ void LLaMA::allocateBuffer( size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - const size_t batchxbeam = batch_size * beam_width; - const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batchxbeam * max_cache_seq_len - * hidden_units_ / tensor_para_.world_size_; - - if (vocab_size_ != vocab_size_padded_) { - padded_embedding_kernel_ = - (T*)(allocator_->reMalloc(padded_embedding_kernel_, sizeof(T) * hidden_units_ * vocab_size_padded_, true)); - padded_embedding_kernel_ptr_ = padded_embedding_kernel_; - - padded_embedding_bias_ = - (T*)(allocator_->reMalloc(padded_embedding_bias_, sizeof(T) * vocab_size_padded_, true)); - } - - input_attention_mask_ = (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); - decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * 
hidden_units_, false)); - decoder_output_buf_ = (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); - nccl_logits_buf_ = (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); - cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); - finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); - h_finished_buf_ = new bool[batchxbeam]; - sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); + const size_t batchxbeam = batch_size * beam_width; + const size_t self_cache_size = + (num_layer_ / pipeline_para_.world_size_) * batchxbeam * max_cache_seq_len * hidden_units_; + + input_attention_mask_ = (T*)(allocator_->reMalloc( + input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); + decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + decoder_output_buf_ = + (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + normed_decoder_output_buf_ = + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_, false)); + nccl_logits_buf_ = + (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_, false)); + cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); + finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); + h_finished_buf_ = new bool[batchxbeam]; + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); value_cache_ = key_cache_ + self_cache_size; @@ -111,12 +106,6 @@ void LLaMA::allocateBuffer( cache_indirections_[1] = cache_indirections_[0] + batchxbeam * max_seq_len; } - // prompt_learning weight batch ptrs - prompt_learning_weight_batch_ = - (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); - tiled_prompt_lengths_buf_ = - (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); - tiled_input_ids_buf_ = (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batchxbeam, true)); @@ -149,12 +138,6 @@ template void LLaMA::freeBuffer() { if (is_allocate_buffer_) { - if (vocab_size_ != vocab_size_padded_) { - padded_embedding_kernel_ptr_ = nullptr; - allocator_->free((void**)(&padded_embedding_kernel_)); - allocator_->free((void**)(&padded_embedding_bias_)); - } - allocator_->free((void**)(&input_attention_mask_)); allocator_->free((void**)(&decoder_input_buf_)); allocator_->free((void**)(&decoder_output_buf_)); @@ -171,9 +154,6 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&cache_indirections_)[0]); } - allocator_->free((void**)(&prompt_learning_weight_batch_)); - 
allocator_->free((void**)(&tiled_prompt_lengths_buf_)); - allocator_->free((void**)(&tiled_input_ids_buf_)); allocator_->free((void**)(&tiled_input_lengths_buf_)); allocator_->free((void**)(&tiled_total_padding_count_)); @@ -199,22 +179,22 @@ void LLaMA::freeBuffer() template LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - int start_id, - int end_id, - unsigned long long random_seed, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + unsigned long long random_seed, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -225,42 +205,33 @@ LLaMA::LLaMA(size_t head_num, start_id_(start_id), end_id_(end_id), hidden_units_(head_num * size_per_head), - local_head_num_(head_num / 1), attention_type_(attention_type) { - tensor_para_.world_size_ = 1; - tensor_para_.rank_ = 0; pipeline_para_.world_size_ = 1; pipeline_para_.rank_ = 0; - - int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); - if (std::is_same::value) { - local_vacab_size = ceil(local_vacab_size / 8.f) * 8; - } - vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; initialize(); } template LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - int start_id, - int end_id, - unsigned long long random_seed, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + unsigned long long random_seed, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -271,18 +242,11 @@ LLaMA::LLaMA(size_t head_num, start_id_(start_id), end_id_(end_id), hidden_units_(head_num * size_per_head), - tensor_para_(tensor_para), pipeline_para_(pipeline_para), - local_head_num_(head_num / tensor_para.world_size_), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce), attention_type_(attention_type) { - int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); - if 
(std::is_same::value) { - local_vacab_size = ceil(local_vacab_size / 8.f) * 8; - } - vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; initialize(); } @@ -297,12 +261,8 @@ LLaMA::LLaMA(LLaMA const& llama): rotary_embedding_dim_(llama.rotary_embedding_dim_), start_id_(llama.start_id_), end_id_(llama.end_id_), - prompt_learning_start_id_(llama.prompt_learning_start_id_), hidden_units_(llama.hidden_units_), - tensor_para_(llama.tensor_para_), pipeline_para_(llama.pipeline_para_), - local_head_num_(llama.local_head_num_), - vocab_size_padded_(llama.vocab_size_padded_), custom_all_reduce_comm_(llama.custom_all_reduce_comm_), enable_custom_all_reduce_(llama.enable_custom_all_reduce_), attention_type_(llama.attention_type_) @@ -335,16 +295,16 @@ void LLaMA::unRegisterCallback() template void LLaMA::forward(std::vector* output_tensors, - const std::vector* input_tensors, - const LLaMAWeight* llama_weights) + const std::vector* input_tensors, + const LLaMAWeight* llama_weights) { FT_CHECK(false); } template void LLaMA::forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const LLaMAWeight* llama_weights) + const std::unordered_map* input_tensors, + const LLaMAWeight* llama_weights) { // input_tensors: // input_ids [batch_size, max_input_length] @@ -385,13 +345,6 @@ void LLaMA::forward(std::unordered_map* output_ten const size_t batch_size = output_tensors->at("output_ids").shape[0]; const size_t beam_width = output_tensors->at("output_ids").shape[1]; - - // Prefix Prompt Inputs - // Padding works as follows: p p x x i i i x x --> p p i i i x x x x (p denotes prompt, i denotes input, x denotes - // pad) - // TODO (perkzz): move unnecessary paddings - int max_prefix_prompt_length = 0; - // NOTE: Prefix Prompt PreProcessing // get prefix_prompt_weight for each batch --> shape [batch, beam_width] // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] @@ -431,15 +384,12 @@ void LLaMA::forward(std::unordered_map* output_ten const std::vector self_k_cache_shape = {num_layer_ / pipeline_para_.world_size_, batch_size * beam_width, - local_head_num_, + head_num_, size_per_head_ / (16 / sizeof(T)), max_cache_seq_len, 16 / sizeof(T)}; - const std::vector self_v_cache_shape = {num_layer_ / pipeline_para_.world_size_, - batch_size * beam_width, - local_head_num_, - max_cache_seq_len, - size_per_head_}; + const std::vector self_v_cache_shape = { + num_layer_ / pipeline_para_.world_size_, batch_size * beam_width, head_num_, max_cache_seq_len, size_per_head_}; // initialize the output ids and parent ids cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); @@ -452,8 +402,11 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); + std::cout << __FILE__ << ":" << __LINE__ << "\n"; + // handle first step if (max_input_length > 1) { + std::cout << __FILE__ << ":" << __LINE__ << "\n"; invokeTileGptInputs(tiled_input_ids_buf_, tiled_input_lengths_buf_, input_tensors->at("input_ids").getPtr(), @@ -465,22 +418,22 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, - output_ids_buf_, - llama_weights->pre_decoder_embedding_table, - llama_weights->position_encoding_table, - pPromptTuningParam{}, // no p/prompt tuning - tiled_input_ids_buf_, - 1, - max_input_length, - max_input_length, - batch_size * beam_width, - hidden_units_, - stream_); + output_ids_buf_, + 
llama_weights->pre_decoder_embedding_table, + llama_weights->position_encoding_table, + pPromptTuningParam{}, // no p/prompt tuning + tiled_input_ids_buf_, + 1, + max_input_length, + max_input_length, + batch_size * beam_width, + hidden_units_, + stream_); sync_check_cuda_error(); invokeBuildDecoderAttentionMask(input_attention_mask_, tiled_input_lengths_buf_, - tiled_prompt_lengths_buf_, + nullptr, batch_size * beam_width, max_input_length, 0, @@ -496,22 +449,9 @@ void LLaMA::forward(std::unordered_map* output_ten {"attention_mask", Tensor{MEMORY_GPU, data_type, - {batch_size * beam_width, - 1, - (size_t)max_input_length, - (size_t)(max_input_length)}, + {batch_size * beam_width, 1, (size_t)max_input_length, (size_t)(max_input_length)}, input_attention_mask_}}, - {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_}}, - {"d_prefix_prompt_batch", - Tensor{MEMORY_GPU, - data_type, - {batch_size * beam_width}, - nullptr}}, - {"d_prefix_prompt_lengths", - Tensor{MEMORY_GPU, - TYPE_INT32, - {batch_size * beam_width}, - nullptr}}}; + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_}}}; std::unordered_map decoder_output_tensors{ {"decoder_output", @@ -524,9 +464,11 @@ void LLaMA::forward(std::unordered_map* output_ten {"last_token_hidden_units", Tensor{MEMORY_GPU, data_type, {batch_size * beam_width, hidden_units_}, decoder_output_buf_}}}; + std::cout << __FILE__ << ":" << __LINE__ << "\n"; llama_context_decoder_->forward( &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); + std::cout << __FILE__ << ":" << __LINE__ << "\n"; invokeDecodingInitialize(finished_buf_, sequence_lengths_, nullptr, @@ -537,7 +479,8 @@ void LLaMA::forward(std::unordered_map* output_ten max_input_length - 1, stream_); sync_check_cuda_error(); - } else if (max_input_length == 0) { + } + else if (max_input_length == 0) { max_input_length++; invokeDecodingInitialize(finished_buf_, sequence_lengths_, @@ -555,7 +498,8 @@ void LLaMA::forward(std::unordered_map* output_ten cudaMemcpyHostToDevice, stream_); sync_check_cuda_error(); - } else if (max_input_length == 1) { + } + else if (max_input_length == 1) { invokeDecodingInitialize(finished_buf_, sequence_lengths_, nullptr, @@ -582,27 +526,11 @@ void LLaMA::forward(std::unordered_map* output_ten cudaMemcpyDeviceToDevice, stream_); } - - if (vocab_size_ == vocab_size_padded_) { - padded_embedding_kernel_ptr_ = llama_weights->post_decoder_embedding.kernel; - } - else { - cudaMemcpyAsync(padded_embedding_kernel_, - llama_weights->post_decoder_embedding.kernel, - sizeof(T) * vocab_size_ * hidden_units_, - cudaMemcpyDeviceToDevice, - stream_); - cudaMemcpyAsync(padded_embedding_bias_, - llama_weights->post_decoder_embedding.bias, - sizeof(T) * vocab_size_, - cudaMemcpyDeviceToDevice, - stream_); - sync_check_cuda_error(); - } + std::cout << __FILE__ << ":" << __LINE__ << "\n"; invokeMaskPaddingTokens(masked_tokens_, input_tensors->at("input_lengths").getPtr(), // not_tiled - tiled_prompt_lengths_buf_, + nullptr, max_cache_seq_len, max_input_length, 0, @@ -611,6 +539,7 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); for (int step = max_input_length; step < (int)max_output_seq_len; step++) { + std::cout << __FILE__ << ":" << __LINE__ << "\n"; const int src_indir_idx = (step - max_input_length) % 2; const int tgt_indir_idx = 1 - src_indir_idx; @@ -622,7 +551,7 @@ void LLaMA::forward(std::unordered_map* 
output_ten for (uint ite = 0; ite < iteration_num; ++ite) { const int id_offset = ite * local_batch_size * beam_width; const int hidden_units_offset = id_offset * hidden_units_; - const int vocab_size_units_offset = id_offset * vocab_size_padded_; + const int vocab_size_units_offset = id_offset * vocab_size_; if (!(max_input_length > 1 && step == max_input_length)) { if (pipeline_para_.rank_ == 0) { @@ -655,12 +584,6 @@ void LLaMA::forward(std::unordered_map* output_ten TYPE_INT32, {local_batch_size * beam_width}, tiled_total_padding_count_ + id_offset}}, - {"d_prefix_prompt_lengths", - Tensor{MEMORY_GPU, - TYPE_INT32, - {local_batch_size}, - nullptr}}, - {"max_prefix_prompt_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_prefix_prompt_length}}, {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, {"ite", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &ite}}, @@ -688,85 +611,42 @@ void LLaMA::forward(std::unordered_map* output_ten } if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { - invokeGeneralLayerNorm(normed_decoder_output_buf_ + hidden_units_offset, - decoder_output_buf_ + hidden_units_offset, - llama_weights->post_decoder_layernorm.gamma, - llama_weights->post_decoder_layernorm.beta, - layernorm_eps_, - local_batch_size * beam_width, - hidden_units_, - (float*)nullptr, - 0, - stream_); + invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_ + hidden_units_offset, + decoder_output_buf_ + hidden_units_offset, + llama_weights->post_decoder_layernorm.gamma, + llama_weights->post_decoder_layernorm.beta, + layernorm_eps_, + local_batch_size * beam_width, + hidden_units_, + stream_); sync_check_cuda_error(); - if (tensor_para_.world_size_ == 1) { - float alpha = 1.0f; - float beta = 0.0f; - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - vocab_size_padded_, // n - local_batch_size * beam_width, - hidden_units_, // k - &alpha, - padded_embedding_kernel_ptr_, - gemm_data_type, - hidden_units_, // k - normed_decoder_output_buf_ + hidden_units_offset, - gemm_data_type, - hidden_units_, // k - &beta, - logits_buf_ + vocab_size_units_offset, - CUDA_R_32F, - vocab_size_padded_, /* n */ - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - } - else { - FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); - const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; - float alpha = 1.0f; - float beta = 0.0f; - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - local_vocab_size, // n - local_batch_size * beam_width, - hidden_units_, // k - &alpha, - padded_embedding_kernel_ptr_ - + tensor_para_.rank_ * local_vocab_size * hidden_units_, - gemm_data_type, - hidden_units_, // k - normed_decoder_output_buf_ + hidden_units_offset, - gemm_data_type, - hidden_units_, // k - &beta, - nccl_logits_buf_ + vocab_size_units_offset - + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, - CUDA_R_32F, - local_vocab_size, /* n */ - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, - nccl_logits_buf_ + vocab_size_units_offset, - local_batch_size * beam_width * local_vocab_size, - tensor_para_.rank_, - tensor_para_, - stream_); - invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, - nccl_logits_buf_ + vocab_size_units_offset, - tensor_para_.world_size_, - local_batch_size * beam_width, - local_vocab_size, - stream_); - } + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + vocab_size_, + 
local_batch_size * beam_width, + hidden_units_, // k + &alpha, + llama_weights->post_decoder_embedding.kernel, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + logits_buf_ + vocab_size_units_offset, + CUDA_R_32F, + vocab_size_, + CUDA_R_32F, + cublasGemmAlgo_t(-1)); int tmp_local_batch_size = local_batch_size; bool is_initialize_random_table = step == max_input_length; std::unordered_map dynamic_decode_input_tensors{ - {"logits", - Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size_padded_}, logits_buf_}}, - // {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size_padded_}, nullptr}}, + {"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size_}, logits_buf_}}, + // {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size_}, nullptr}}, {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, {"input_lengths", @@ -854,7 +734,8 @@ void LLaMA::forward(std::unordered_map* output_ten } ftNcclGroupEnd(); // throw errors when detected - ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); + NcclParam tensor_para(0, 1); + ftNcclStreamSynchronize(tensor_para, pipeline_para_, stream_); sync_check_cuda_error(); } @@ -865,14 +746,13 @@ void LLaMA::forward(std::unordered_map* output_ten setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); - if (pipeline_para_.rank_ == 0 && tensor_para_.rank_ == 0) { + if (pipeline_para_.rank_ == 0) { token_generated_cb_(output_tensors, token_generated_ctx_); } } if (step == max_input_length) { /* We have just finished processing input: update the padding count: * total_padding_count += (max_input_length - input_lengths) - * if has prefix prompts, += (max_prefix_prompt_length - prompt_length) */ invokeUpdatePaddingCount(tiled_total_padding_count_, input_tensors->at("input_lengths").getPtr(), // not_tiled @@ -884,6 +764,7 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); } } + std::cout << __FILE__ << ":" << __LINE__ << "\n"; setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); @@ -891,15 +772,16 @@ void LLaMA::forward(std::unordered_map* output_ten template void LLaMA::sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors) + const std::unordered_map* input_tensors) { + NcclParam tensor_para(0, 1); + FT_LOG_DEBUG(__PRETTY_FUNCTION__); if (pipeline_para_.world_size_ == 1) { // throw errors when detected - ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); + ftNcclStreamSynchronize(tensor_para, pipeline_para_, stream_); return; } - const auto pp_rank = pipeline_para_.rank_; ftNcclGroupStart(); @@ -921,14 +803,14 @@ void LLaMA::sendTensorsToFirstPipelineNode(std::unordered_map void LLaMA::setOutputTensors(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const size_t max_input_length, - const size_t max_output_seq_len) + const std::unordered_map* input_tensors, + const size_t max_input_length, + const size_t max_output_seq_len) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); if (pipeline_para_.rank_ != pipeline_para_.world_size_ - 1) { @@ -981,15 +863,15 @@ void LLaMA::setOutputTensors(std::unordered_map* o param.beams = transposed_output_ids_buf_; 
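        // Note: these fields appear to populate FT's gatherTreeParam. For beam search
        // the gather-tree step retraces parent_ids from the final step back to step 0
        // so that output_ids holds the tokens of the surviving beams rather than the
        // per-step selections; with beam_width == 1, parent_ids is passed as nullptr
        // and no beam reordering is needed.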
param.max_sequence_lengths = sequence_lengths_; // add sequence_length 1 here because the sequence_length of time step t is t - 1 - param.max_sequence_length_final_step = 1; - param.max_time = max_output_seq_len; - param.batch_size = batch_size; - param.beam_width = beam_width; - param.step_ids = output_ids_buf_; - param.parent_ids = beam_width == 1 ? nullptr : parent_ids_buf_; - param.end_tokens = end_ids_buf_; - param.max_input_length = max_input_length; - param.prefix_soft_prompt_lengths = nullptr; + param.max_sequence_length_final_step = 1; + param.max_time = max_output_seq_len; + param.batch_size = batch_size; + param.beam_width = beam_width; + param.step_ids = output_ids_buf_; + param.parent_ids = beam_width == 1 ? nullptr : parent_ids_buf_; + param.end_tokens = end_ids_buf_; + param.max_input_length = max_input_length; + param.prefix_soft_prompt_lengths = nullptr; param.input_lengths = tiled_input_lengths_buf_; param.max_prefix_soft_prompt_length = 0; param.max_input_without_prompt_length = max_input_length; @@ -1029,18 +911,6 @@ size_t LLaMA::getPipelineParallelSize() return pipeline_para_.world_size_; } -template -size_t LLaMA::getTensorParallelRank() -{ - return tensor_para_.rank_; -} - -template -size_t LLaMA::getTensorParallelSize() -{ - return tensor_para_.world_size_; -} - template bool* LLaMA::getFinishBuffer() { diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 2f4f52c7b..7a66a2ebf 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -24,7 +24,6 @@ #include "src/fastertransformer/models/llama/LLaMADecoder.h" #include "src/fastertransformer/models/llama/LLaMAWeight.h" #include "src/fastertransformer/utils/custom_ar_comm.h" -#include "src/fastertransformer/utils/prompt_learning.h" namespace fastertransformer { @@ -40,13 +39,12 @@ class LLaMA: public BaseLayer { size_t rotary_embedding_dim_; static constexpr bool neox_rotary_style_ = true; - static constexpr float layernorm_eps_ = 1e-5f; + static constexpr float layernorm_eps_ = 1e-6f; int start_id_; int end_id_; size_t hidden_units_; - size_t local_head_num_; NcclParam tensor_para_; NcclParam pipeline_para_; @@ -55,19 +53,11 @@ class LLaMA: public BaseLayer { AttentionType attention_type_; - size_t vocab_size_padded_; - const bool is_context_qk_buf_float_ = - (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || - std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); + const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr + || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); - // Prompt Learning Parameters - PromptLearningType prompt_learning_type_; - int prompt_learning_start_id_; // start_id for prompt_learning (only needed by prefix prompts) - bool has_prefix_prompt_; - bool has_prefix_soft_prompt_; - - LLaMADecoder* llama_decoder_; - LLaMAContextDecoder* llama_context_decoder_; + LLaMADecoder* llama_decoder_; + LLaMAContextDecoder* llama_context_decoder_; DynamicDecodeLayer* dynamic_decode_layer_; void allocateBuffer() override; @@ -78,10 +68,6 @@ class LLaMA: public BaseLayer { void initialize(); protected: - T* padded_embedding_kernel_; - T* padded_embedding_bias_; - const T* padded_embedding_kernel_ptr_; - T* input_attention_mask_; T* decoder_input_buf_; @@ -102,10 +88,6 @@ class LLaMA: public BaseLayer { T* value_cache_; int* cache_indirections_[2] = {nullptr, nullptr}; - // prompt_learning weight_batch ptrs - 
const T** prompt_learning_weight_batch_; - int* tiled_prompt_lengths_buf_; // only needed by prefix prompts - int* tiled_input_ids_buf_; int* tiled_input_lengths_buf_; int* transposed_output_ids_buf_; @@ -135,42 +117,42 @@ class LLaMA: public BaseLayer { public: LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - int start_id, - int end_id, - unsigned long long random_seed, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + unsigned long long random_seed, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce = 0); LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - int start_id, - int end_id, - unsigned long long random_seed, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + unsigned long long random_seed, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce = 0); LLaMA(LLaMA const& LLaMA); @@ -178,11 +160,11 @@ class LLaMA: public BaseLayer { void forward(std::vector* output_tensors, const std::vector* input_tensors, - const LLaMAWeight* llama_weights); + const LLaMAWeight* llama_weights); void forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const LLaMAWeight* llama_weights); + const LLaMAWeight* llama_weights); size_t getPipelineParallelRank(); size_t getPipelineParallelSize(); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index ecf127ae6..e8f4a4e21 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -21,7 +21,6 @@ #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" - namespace fastertransformer { template @@ -42,8 +41,8 @@ void LLaMAContextDecoder::initialize() false, 0); - ffn_layer_ = new GeluFfnLayer(0, // max_batch_size - 1, + ffn_layer_ = new GeluFfnLayer(0, // max_batch_size + 1, head_num_, 
size_per_head_, 0, // expert_num @@ -55,7 +54,7 @@ void LLaMAContextDecoder::initialize() false, 0, false // use_gated_activation = false - ); + ); } template @@ -128,21 +127,21 @@ int LLaMAContextDecoder::getFirstLayerParallelId() template LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - bool neox_rotary_style, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - AttentionType attention_type, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + float layernorm_eps, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), head_num_(head_num), size_per_head_(size_per_head), @@ -190,9 +189,9 @@ LLaMAContextDecoder::~LLaMAContextDecoder() } template -void LLaMAContextDecoder::forward(std::vector* output_tensors, - const std::vector* input_tensors, - const std::vector*>* llama_decoder_layer_weight) +void LLaMAContextDecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* llama_decoder_layer_weight) { std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, {"attention_mask", input_tensors->at(1)}, @@ -206,17 +205,14 @@ void LLaMAContextDecoder::forward(std::vector* } template -void LLaMAContextDecoder::forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const std::vector*>* llama_decoder_layer_weight) +void LLaMAContextDecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* llama_decoder_layer_weight) { // input tensors: // decoder_input [batch_size, seq_len, hidden_dimension], // attention_mask [batch_size, 1, seq_len, seq_len + max_prompt_length] // input_lengths [batch_size] - // d_prefix_prompt_batch [batch_size], - // each element contains ptr with buffer shape[2, local_head_num_, prompt_length, size_per_head] - // prefix_prompt_lengths [batch size] // output tensors: // decoder_output [batch_size, seq_len, hidden_dimension], @@ -228,7 +224,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. 
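// ---------------------------------------------------------------------------
// Editor's note (illustrative addition, not part of this patch): the comment
// block above describes how LLaMAContextDecoder::forward walks the request
// batch in micro-batches ("ite") so that pipeline-parallel ranks can overlap
// work; the hunk that follows pins local_batch_size to batch_size, i.e. a
// single micro-batch. The helper below is a minimal sketch of that splitting
// idea only -- run_micro_batches is a hypothetical name, it assumes the
// pipeline world size divides the batch evenly, and it is not the
// FasterTransformer implementation of getLocalBatchSize.
// ---------------------------------------------------------------------------
#include <cassert>
#include <cstdio>

void run_micro_batches(int batch_size, int pipeline_world_size)
{
    const int local_batch_size = batch_size / pipeline_world_size;  // assumed to divide evenly
    assert(local_batch_size > 0 && batch_size % local_batch_size == 0);
    const int iteration_num = batch_size / local_batch_size;

    for (int ite = 0; ite < iteration_num; ite++) {
        // Each iteration covers rows [ite * local_batch_size, (ite + 1) * local_batch_size)
        // of tensors shaped [batch_size, seq_len, hidden_dimension].
        std::printf("micro-batch %d: rows %d..%d\n",
                    ite, ite * local_batch_size, (ite + 1) * local_batch_size - 1);
    }
}

int main()
{
    run_micro_batches(/*batch_size=*/8, /*pipeline_world_size=*/2);
    return 0;
}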
- FT_CHECK(input_tensors->size() == 5); + FT_CHECK(input_tensors->size() == 3); FT_CHECK(output_tensors->size() == 4); const int batch_size = input_tensors->at("decoder_input").shape[0]; @@ -238,13 +234,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* const DataType data_type = getTensorType(); allocateBuffer(batch_size, seq_len); - T* decoder_input = input_tensors->at("decoder_input").getPtr(); - T* decoder_output = output_tensors->at("decoder_output").getPtr(); - const T* attention_mask = input_tensors->at("attention_mask").getPtr(); - const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); - const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + T* decoder_input = input_tensors->at("decoder_input").getPtr(); + T* decoder_output = output_tensors->at("decoder_output").getPtr(); + const T* attention_mask = input_tensors->at("attention_mask").getPtr(); - const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); + // const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); + const int local_batch_size = batch_size; FT_CHECK(batch_size % local_batch_size == 0); const int iteration_num = batch_size / local_batch_size; @@ -261,9 +256,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } - AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? - getUnfusedAttentionType(attention_type_) : - attention_type_; + AttentionType attention_type = attention_type_; const bool is_unpadded_mha = isUnPaddedMHA(attention_type); for (int ite = 0; ite < iteration_num; ite++) { @@ -309,23 +302,19 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; - ftNcclRecv(layer_input, - data_size, - pipeline_para_.rank_ - 1, - pipeline_para_, - stream_); + std::cout << __FILE__ << ":" << __LINE__ << "\n"; + std::cout << "Recv: " << layer_output << "," << data_size << "\n"; + ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); } - invokeGeneralLayerNorm(decoder_normed_input_, - layer_input, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, - layernorm_eps_, - h_token_num, - hidden_units_, - (float*)nullptr, - 0, - stream_); + invokeGeneralLLaMALayerNorm(decoder_normed_input_, + layer_input, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + h_token_num, + hidden_units_, + stream_); sync_check_cuda_error(); TensorMap self_attention_input_tensors{ @@ -339,19 +328,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; - self_attention_input_tensors.insertIfValid( - "d_prefix_prompt_batch", - Tensor{MEMORY_GPU, - data_type, - {(size_t)local_batch_size}, - d_prefix_prompt_batch != nullptr ? d_prefix_prompt_batch + ite * local_batch_size : nullptr}); - self_attention_input_tensors.insertIfValid("d_prefix_prompt_lengths", - Tensor{MEMORY_GPU, - TYPE_INT32, - {(size_t)local_batch_size}, - d_prefix_prompt_lengths != nullptr ? 
- d_prefix_prompt_lengths + ite * local_batch_size : - nullptr}); if (is_unpadded_mha) { self_attention_input_tensors.insert("padding_offset", @@ -383,51 +359,48 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (is_final == false) { invokeGeneralAddBiasResidualPreLayerNorm( - self_attn_output_, - decoder_normed_input_, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, - layernorm_eps_, - h_token_num, - hidden_units_, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - 0, - stream_); + self_attn_output_, + decoder_normed_input_, + self_attn_output_, + layer_input, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + h_token_num, + hidden_units_, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + 0, + stream_); TensorMap ffn_input_tensors( {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); - TensorMap ffn_output_tensors({{"ffn_output", - Tensor{MEMORY_GPU, - data_type, - {h_token_num, (size_t)hidden_units_}, - layer_output}}}); + TensorMap ffn_output_tensors( + {{"ffn_output", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); ffn_layer_->forward( &ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - h_token_num, - hidden_units_, - stream_); + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + stream_); sync_check_cuda_error(); if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; - ftNcclSend(layer_output, - data_size, - pipeline_para_.rank_ + 1, - pipeline_para_, - stream_); + std::cout << __FILE__ << ":" << __LINE__ << "\n"; + std::cout << "Send: " << layer_output << "," << data_size << "\n"; + ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); + std::cout << __FILE__ << ":" << __LINE__ << "\n"; } if ((l == num_layer_ - 1) && is_unpadded_mha) { diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index c9c474e49..115b3b06b 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -81,32 +81,32 @@ class LLaMAContextDecoder: public BaseLayer { public: LLaMAContextDecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - bool neox_rotary_style, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - AttentionType attention_type = AttentionType::FUSED_MHA, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce_ = 0); + size_t size_per_head, + 
size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + float layernorm_eps, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type = AttentionType::FUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce_ = 0); LLaMAContextDecoder(LLaMAContextDecoder const& decoder); ~LLaMAContextDecoder(); - void forward(std::vector* output_tensors, - const std::vector* input_tensors, + void forward(std::vector* output_tensors, + const std::vector* input_tensors, const std::vector*>* decoder_layer_weights); - void forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, + void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, const std::vector*>* llama_decoder_layer_weight); }; diff --git a/src/fastertransformer/models/llama/LLaMADecoder.cc b/src/fastertransformer/models/llama/LLaMADecoder.cc index 051744693..a98cd0159 100644 --- a/src/fastertransformer/models/llama/LLaMADecoder.cc +++ b/src/fastertransformer/models/llama/LLaMADecoder.cc @@ -38,8 +38,8 @@ void LLaMADecoder::initialize() false, 0); - ffn_layer_ = new GeluFfnLayer(0, // max_batch_size - 1, + ffn_layer_ = new GeluFfnLayer(0, // max_batch_size + 1, head_num_, size_per_head_, 0, // expert_num @@ -51,7 +51,7 @@ void LLaMADecoder::initialize() false, 0, false // use_gated_activation = false - ); + ); } template @@ -117,19 +117,19 @@ int LLaMADecoder::getFirstLayerParallelId() template LLaMADecoder::LLaMADecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - bool neox_rotary_style, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + float layernorm_eps, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), head_num_(head_num), size_per_head_(size_per_head), @@ -173,17 +173,17 @@ LLaMADecoder::~LLaMADecoder() } template -void LLaMADecoder::forward(std::vector* output_tensors, - const std::vector* input_tensors, - const std::vector*>* llama_decoder_layer_weight) +void LLaMADecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* llama_decoder_layer_weight) { FT_CHECK(false); } template -void LLaMADecoder::forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const std::vector*>* llama_decoder_layer_weight) +void LLaMADecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* llama_decoder_layer_weight) { // input tensors: // decoder_input [local_batch_size, hidden_dimension], @@ -191,8 +191,6 @@ void LLaMADecoder::forward(std::unordered_map* // sequence_lengths [local_batch_size] // total_padding_tokens [local_batch_size], // max_input_length [1] on cpu - // d_prefix_prompt_lengths [local_batch_size], 
on GPU - // max_prefix_prompt_length [1] on cpu // step [1] on cpu // ite [1] on cpu // cache_indirection [local_batch_size / beam_width, beam_width, memory_len] @@ -241,23 +239,17 @@ void LLaMADecoder::forward(std::unordered_map* // ftNcclRecv(layer_input, local_batch_size * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, // stream_); - ftNcclRecv(layer_input, - data_size, - pipeline_para_.rank_ - 1, - pipeline_para_, - stream_); + ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); } - invokeGeneralLayerNorm(decoder_normed_input_, - layer_input, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, - layernorm_eps_, - local_batch_size, - hidden_units_, - (float*)nullptr, - 0, - stream_); + invokeGeneralLLaMALayerNorm(decoder_normed_input_, + layer_input, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + local_batch_size, + hidden_units_, + stream_); sync_check_cuda_error(); TensorMap self_attention_input_tensors(*input_tensors); @@ -280,53 +272,46 @@ void LLaMADecoder::forward(std::unordered_map* {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; self_attention_layer_->forward(&self_attention_output_tensors, - &self_attention_input_tensors, - &llama_decoder_layer_weight->at(l)->self_attention_weights); + &self_attention_input_tensors, + &llama_decoder_layer_weight->at(l)->self_attention_weights); invokeGeneralAddBiasResidualPreLayerNorm( - self_attn_output_, - decoder_normed_input_, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, - layernorm_eps_, - local_batch_size, - hidden_units_, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - 0, - stream_); + self_attn_output_, + decoder_normed_input_, + self_attn_output_, + layer_input, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + local_batch_size, + hidden_units_, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + 0, + stream_); TensorMap ffn_input_tensors( {{"ffn_input", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}}}); - TensorMap ffn_output_tensors({{"ffn_output", - Tensor{MEMORY_GPU, - data_type, - {local_batch_size, hidden_units_}, - layer_output}}}); + TensorMap ffn_output_tensors( + {{"ffn_output", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, layer_output}}}); ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - local_batch_size, - hidden_units_, - stream_); + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + local_batch_size, + hidden_units_, + stream_); sync_check_cuda_error(); if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && 
pipeline_para_.world_size_ > 1) { int data_size = local_batch_size * hidden_units_; - ftNcclSend(layer_output, - data_size, - pipeline_para_.rank_ + 1, - pipeline_para_, - stream_); + ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); } } diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc index 412a1d076..3c40613fc 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc @@ -20,14 +20,8 @@ namespace fastertransformer { template -LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const int hidden_units, - const int inter_size, - const int tensor_para_size, - const int tensor_para_rank): - hidden_units_(hidden_units), - inter_size_(inter_size), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) +LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const int hidden_units, const int inter_size): + hidden_units_(hidden_units), inter_size_(inter_size) { mallocWeights(); setWeightPtr(); @@ -37,7 +31,7 @@ template LLaMADecoderLayerWeight::~LLaMADecoderLayerWeight() { if (is_maintain_buffer == true) { - for (int i = 0; i < 12; i++) { + for (int i = 0; i < 14; i++) { if (i != attention_dense_bias_weight_id) { cudaFree(weights_ptr[i]); } @@ -62,50 +56,48 @@ LLaMADecoderLayerWeight::~LLaMADecoderLayerWeight() template LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other): - hidden_units_(other.hidden_units_), - inter_size_(other.inter_size_), - tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_) + hidden_units_(other.hidden_units_), inter_size_(other.inter_size_) { mallocWeights(); - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_); - cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + //cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); nullptr; + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_); + //cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ * hidden_units_); + //cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_); + //cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ * hidden_units_); + //cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + 
cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_ * inter_size_); + //cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); + //cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); setWeightPtr(); } template LLaMADecoderLayerWeight& LLaMADecoderLayerWeight::operator=(const LLaMADecoderLayerWeight& other) { - hidden_units_ = other.hidden_units_; - inter_size_ = other.inter_size_; - tensor_para_size_ = other.tensor_para_size_; - tensor_para_rank_ = other.tensor_para_rank_; + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; mallocWeights(); - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + //cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_); - cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_); + //cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ * hidden_units_); + //cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_); + //cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ * hidden_units_); + //cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_ * inter_size_); + //cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); + //cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); setWeightPtr(); return *this; } @@ -114,85 +106,99 @@ template void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) { FT_CHECK(is_maintain_buffer == true); - const std::string rank_spec = std::to_string(tensor_para_rank_); +// loadWeightFromBin( +// weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".attention_norm.bias.bin", model_file_type); loadWeightFromBin( - weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".input_layernorm.bias.bin", model_file_type); - loadWeightFromBin( - weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".input_layernorm.weight.bin", model_file_type); - loadWeightFromBin(weights_ptr[2], - {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, - dir_path + ".attention.query_key_value.weight." 
+ rank_spec + ".bin", - model_file_type); + weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".attention_norm.weight.bin", model_file_type); - loadWeightFromBin(weights_ptr[3], - {(size_t)(3 * hidden_units_ / tensor_para_size_)}, - dir_path + ".attention.query_key_value.bias." + rank_spec + ".bin", + loadWeightFromBin(weights_ptr[2], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_)}, + dir_path + ".attention.query_key_value.weight.bin", model_file_type); +// loadWeightFromBin(weights_ptr[3], +// {(size_t)(3 * hidden_units_)}, +// dir_path + ".attention.query_key_value.bias.bin", +// model_file_type); loadWeightFromBin(weights_ptr[4], - {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, - dir_path + ".attention.dense.weight." + rank_spec + ".bin", + {(size_t)(hidden_units_), (size_t)hidden_units_}, + dir_path + ".attention.wo.weight.bin", model_file_type); - - loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.dense.bias.bin", model_file_type); +// loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.wo.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[6], - {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, - dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", - model_file_type); - loadWeightFromBin(weights_ptr[7], - {(size_t)(inter_size_ / tensor_para_size_)}, - dir_path + ".mlp.dense_h_to_4h.bias." + rank_spec + ".bin", + {(size_t)hidden_units_, (size_t)(inter_size_)}, + dir_path + ".feed_forward.w1.weight.bin", model_file_type); +// loadWeightFromBin( +// weights_ptr[7], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w1.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[8], - {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, - dir_path + ".mlp.dense_4h_to_h.weight." 
+ rank_spec + ".bin", + {(size_t)(inter_size_), (size_t)hidden_units_}, + dir_path + ".feed_forward.w2.weight.bin", model_file_type); - loadWeightFromBin( - weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.bias.bin", model_file_type); - loadWeightFromBin( - weights_ptr[10], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.bias.bin", model_file_type); - loadWeightFromBin( - weights_ptr[11], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); +// loadWeightFromBin( +// weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".feed_forward.w2.bias.bin", model_file_type); + + loadWeightFromBin(weights_ptr[10], + {(size_t)hidden_units_, (size_t)(inter_size_)}, + dir_path + ".feed_forward.w3.weight.bin", + model_file_type); +// loadWeightFromBin( +// weights_ptr[11], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w3.bias.bin", model_file_type); + +// loadWeightFromBin(weights_ptr[12], {(size_t)hidden_units_}, dir_path + ".ffn_norm.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".ffn_norm.weight.bin", model_file_type); } template void LLaMADecoderLayerWeight::setWeightPtr() { - pre_layernorm_weights.beta = weights_ptr[0]; + //pre_layernorm_weights.beta = weights_ptr[0]; + pre_layernorm_weights.beta = nullptr; pre_layernorm_weights.gamma = weights_ptr[1]; self_attention_weights.query_weight.kernel = weights_ptr[2]; - self_attention_weights.query_weight.bias = weights_ptr[3]; + //self_attention_weights.query_weight.bias = weights_ptr[3]; + self_attention_weights.query_weight.bias = nullptr; self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; - self_attention_weights.attention_output_weight.bias = weights_ptr[5]; + //self_attention_weights.attention_output_weight.bias = weights_ptr[5]; + self_attention_weights.attention_output_weight.bias = nullptr; ffn_weights.intermediate_weight.kernel = weights_ptr[6]; - ffn_weights.intermediate_weight.bias = weights_ptr[7]; + //ffn_weights.intermediate_weight.bias = weights_ptr[7]; + ffn_weights.intermediate_weight.bias = nullptr; ffn_weights.output_weight.kernel = weights_ptr[8]; - ffn_weights.output_weight.bias = weights_ptr[9]; - - post_attention_layernorm_weights.beta = weights_ptr[10]; - post_attention_layernorm_weights.gamma = weights_ptr[11]; + //ffn_weights.output_weight.bias = weights_ptr[9]; + ffn_weights.output_weight.bias = nullptr; + ffn_weights.gating_weight.kernel = weights_ptr[10]; + //ffn_weights.gating_weight.bias = weights_ptr[11]; + ffn_weights.gating_weight.bias = nullptr; + + //post_attention_layernorm_weights.beta = weights_ptr[12]; + post_attention_layernorm_weights.beta = nullptr; + post_attention_layernorm_weights.gamma = weights_ptr[13]; is_maintain_buffer = true; } template void LLaMADecoderLayerWeight::mallocWeights() { - deviceMalloc(&weights_ptr[0], hidden_units_); + //deviceMalloc(&weights_ptr[0], hidden_units_); deviceMalloc(&weights_ptr[1], hidden_units_); - deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - deviceMalloc(&weights_ptr[5], hidden_units_); - - deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); - deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); - deviceMalloc(&weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); - 
deviceMalloc(&weights_ptr[9], hidden_units_); - deviceMalloc(&weights_ptr[10], hidden_units_); - deviceMalloc(&weights_ptr[11], hidden_units_); + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_); + //deviceMalloc(&weights_ptr[3], 3 * hidden_units_); + deviceMalloc(&weights_ptr[4], hidden_units_ * hidden_units_); + //deviceMalloc(&weights_ptr[5], hidden_units_); + + deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_); + //deviceMalloc(&weights_ptr[7], inter_size_); + deviceMalloc(&weights_ptr[8], inter_size_ * hidden_units_); + //deviceMalloc(&weights_ptr[9], hidden_units_); + deviceMalloc(&weights_ptr[10], hidden_units_ * inter_size_); + //deviceMalloc(&weights_ptr[11], inter_size_); + //deviceMalloc(&weights_ptr[12], hidden_units_); + deviceMalloc(&weights_ptr[13], hidden_units_); } template struct LLaMADecoderLayerWeight; diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h index 4a6fc6a22..35d16300f 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h @@ -29,10 +29,7 @@ template struct LLaMADecoderLayerWeight { public: LLaMADecoderLayerWeight() = default; - LLaMADecoderLayerWeight(const int hidden_units, - const int inter_size, - const int tensor_para_size = 1, - const int tensor_para_rank = 0); + LLaMADecoderLayerWeight(const int hidden_units, const int inter_size); ~LLaMADecoderLayerWeight(); LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other); LLaMADecoderLayerWeight& operator=(const LLaMADecoderLayerWeight& other); @@ -47,11 +44,9 @@ struct LLaMADecoderLayerWeight { private: int hidden_units_; int inter_size_; - int tensor_para_size_; - int tensor_para_rank_; const int attention_dense_bias_weight_id = 5; bool is_maintain_buffer = false; - T* weights_ptr[12]; + T* weights_ptr[14]; void setWeightPtr(); void mallocWeights(); diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc index f0bdc282f..81a22a51d 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.cc +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -19,20 +19,16 @@ namespace fastertransformer { template -LLaMAWeight::LLaMAWeight(const int hidden_units, - const int inter_size, - const int vocab_size, - const int num_layer, - const int tensor_para_size, - const int tensor_para_rank, - const int layer_para_size, - const int layer_para_rank): +LLaMAWeight::LLaMAWeight(const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int layer_para_size, + const int layer_para_rank): hidden_units_(hidden_units), inter_size_(inter_size), vocab_size_(vocab_size), num_layer_(num_layer), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank), layer_para_size_(layer_para_size), layer_para_rank_(layer_para_rank) { @@ -41,8 +37,7 @@ LLaMAWeight::LLaMAWeight(const int hidden_un decoder_layer_weights.reserve(num_layer_); for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { - decoder_layer_weights.push_back(new LLaMADecoderLayerWeight( - hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_)); + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight(hidden_units_, inter_size_)); } else { // Layer-parallelism: allocate empty layer because @@ -77,15 +72,13 @@ LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): inter_size_(other.inter_size_), vocab_size_(other.vocab_size_), 
num_layer_(other.num_layer_), - tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_), layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), prompt_token_weight_size_(other.prompt_token_weight_size_) { mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + //cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); @@ -102,19 +95,17 @@ LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): template LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) { - hidden_units_ = other.hidden_units_; - inter_size_ = other.inter_size_; - vocab_size_ = other.vocab_size_; - num_layer_ = other.num_layer_; - tensor_para_size_ = other.tensor_para_size_; - tensor_para_rank_ = other.tensor_para_rank_; - layer_para_size_ = other.layer_para_size_; - layer_para_rank_ = other.layer_para_rank_; - prompt_token_weight_size_ = other.prompt_token_weight_size_; + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + vocab_size_ = other.vocab_size_; + num_layer_ = other.num_layer_; + layer_para_size_ = other.layer_para_size_; + layer_para_rank_ = other.layer_para_rank_; + prompt_token_weight_size_ = other.prompt_token_weight_size_; mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + //cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); @@ -132,7 +123,8 @@ template void LLaMAWeight::setWeightPtr() { pre_decoder_embedding_table = weights_ptr[0]; - post_decoder_layernorm.beta = weights_ptr[1]; + //post_decoder_layernorm.beta = weights_ptr[1]; + post_decoder_layernorm.beta = nullptr; post_decoder_layernorm.gamma = weights_ptr[2]; post_decoder_embedding.kernel = weights_ptr[3]; } @@ -143,7 +135,7 @@ void LLaMAWeight::mallocWeights() weights_ptr.resize(num_base_weights); deviceMalloc(&weights_ptr[0], vocab_size_ * hidden_units_); - deviceMalloc(&weights_ptr[1], hidden_units_); + //deviceMalloc(&weights_ptr[1], hidden_units_); deviceMalloc(&weights_ptr[2], hidden_units_); deviceMalloc(&weights_ptr[3], hidden_units_ * vocab_size_); @@ -156,16 +148,15 @@ void LLaMAWeight::loadModel(std::string dir_path) FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "llama"); FT_CHECK(is_maintain_buffer == true); - loadWeightFromBin( - weights_ptr[0], {(size_t)(vocab_size_ * hidden_units_)}, dir_path + "/model.wte.bin", model_file_type); - loadWeightFromBin( - weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.bias.bin", model_file_type); - - loadWeightFromBin( - weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.weight.bin", model_file_type); + loadWeightFromBin(weights_ptr[0], + {(size_t)(vocab_size_ * hidden_units_)}, + dir_path + "/model.tok_embeddings.weight.bin", + model_file_type); + //loadWeightFromBin(weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.norm.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.norm.weight.bin", model_file_type); loadWeightFromBin(weights_ptr[3], 
{(size_t)(vocab_size_ * hidden_units_)}, - dir_path + "/model.lm_head.weight.bin", + dir_path + "/model.output.weight.bin", model_file_type); for (int l = 0; l < num_layer_; l++) { diff --git a/src/fastertransformer/models/llama/LLaMAWeight.h b/src/fastertransformer/models/llama/LLaMAWeight.h index b372139e2..e1fed4309 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.h +++ b/src/fastertransformer/models/llama/LLaMAWeight.h @@ -32,8 +32,6 @@ struct LLaMAWeight { const int inter_size, const int vocab_size, const int num_layer, - const int tensor_para_size = 1, - const int tensor_para_rank = 0, const int layer_para_size = 1, const int layer_para_rank = 0); @@ -62,8 +60,6 @@ struct LLaMAWeight { int vocab_size_; int num_layer_; - int tensor_para_size_; - int tensor_para_rank_; int layer_para_size_; int layer_para_rank_; From dbe0657d5f504418261a0e84759dc95e98d45262 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 15 Sep 2023 08:55:49 +0000 Subject: [PATCH 09/55] remove debug code and bug fix --- src/fastertransformer/models/llama/LLaMA.cc | 7 ------- src/fastertransformer/models/llama/LLaMAContextDecoder.cc | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 9fcab580b..d03183a0a 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -402,11 +402,9 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); - std::cout << __FILE__ << ":" << __LINE__ << "\n"; // handle first step if (max_input_length > 1) { - std::cout << __FILE__ << ":" << __LINE__ << "\n"; invokeTileGptInputs(tiled_input_ids_buf_, tiled_input_lengths_buf_, input_tensors->at("input_ids").getPtr(), @@ -464,11 +462,9 @@ void LLaMA::forward(std::unordered_map* output_ten {"last_token_hidden_units", Tensor{MEMORY_GPU, data_type, {batch_size * beam_width, hidden_units_}, decoder_output_buf_}}}; - std::cout << __FILE__ << ":" << __LINE__ << "\n"; llama_context_decoder_->forward( &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - std::cout << __FILE__ << ":" << __LINE__ << "\n"; invokeDecodingInitialize(finished_buf_, sequence_lengths_, nullptr, @@ -526,7 +522,6 @@ void LLaMA::forward(std::unordered_map* output_ten cudaMemcpyDeviceToDevice, stream_); } - std::cout << __FILE__ << ":" << __LINE__ << "\n"; invokeMaskPaddingTokens(masked_tokens_, input_tensors->at("input_lengths").getPtr(), // not_tiled @@ -539,7 +534,6 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); for (int step = max_input_length; step < (int)max_output_seq_len; step++) { - std::cout << __FILE__ << ":" << __LINE__ << "\n"; const int src_indir_idx = (step - max_input_length) % 2; const int tgt_indir_idx = 1 - src_indir_idx; @@ -764,7 +758,6 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); } } - std::cout << __FILE__ << ":" << __LINE__ << "\n"; setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index e8f4a4e21..781338253 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -42,7 +42,7 @@ void LLaMAContextDecoder::initialize() 0); ffn_layer_ = new GeluFfnLayer(0, // 
max_batch_size - 1, + 0, head_num_, size_per_head_, 0, // expert_num From 29c7b690d7973865036593b14aa49b9716d1b711 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 15 Sep 2023 16:24:35 +0000 Subject: [PATCH 10/55] only contextdecoder is necessary --- .../layers/attention_layers/CMakeLists.txt | 5 + .../LLaMAContextAttentionLayer.cc | 622 ++++++++++++++++++ .../LLaMAContextAttentionLayer.h | 131 ++++ .../models/llama/CMakeLists.txt | 20 +- src/fastertransformer/models/llama/LLaMA.cc | 262 +------- src/fastertransformer/models/llama/LLaMA.h | 4 - .../models/llama/LLaMAContextDecoder.cc | 30 +- .../models/llama/LLaMADecoder.cc | 326 --------- .../models/llama/LLaMADecoder.h | 100 --- .../models/llama/LLaMADecoderLayerWeight.cc | 93 ++- .../models/llama/LLaMAWeight.cc | 10 +- 11 files changed, 824 insertions(+), 779 deletions(-) create mode 100644 src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc create mode 100644 src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h delete mode 100644 src/fastertransformer/models/llama/LLaMADecoder.cc delete mode 100644 src/fastertransformer/models/llama/LLaMADecoder.h diff --git a/src/fastertransformer/layers/attention_layers/CMakeLists.txt b/src/fastertransformer/layers/attention_layers/CMakeLists.txt index 1f0e93b1b..13821892d 100644 --- a/src/fastertransformer/layers/attention_layers/CMakeLists.txt +++ b/src/fastertransformer/layers/attention_layers/CMakeLists.txt @@ -44,6 +44,11 @@ set_property(TARGET GptContextAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE set_property(TARGET GptContextAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels trt_fused_multi_head_attention fpA_intB_gemm int8_gemm nvtx_utils) +add_library(LLaMAContextAttentionLayer STATIC LLaMAContextAttentionLayer.cc) +set_property(TARGET LLaMAContextAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMAContextAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMAContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels trt_fused_multi_head_attention fpA_intB_gemm int8_gemm nvtx_utils) + add_library(DisentangledAttentionLayer STATIC DisentangledAttentionLayer.cc) set_property(TARGET DisentangledAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET DisentangledAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc new file mode 100644 index 000000000..38ec79b47 --- /dev/null +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -0,0 +1,622 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" +#include "src/fastertransformer/kernels/unfused_attention_kernels.h" +#include "src/fastertransformer/utils/nvtx_utils.h" + +namespace fastertransformer { + +template +void LLaMAContextAttentionLayer::forward(TensorMap* output_tensors, + TensorMap* input_tensors, + const AttentionWeight* attention_weights) +{ + // input_tensors: + // input_query [token_num, hidden_dimension] + // attention_mask [batch_size, 1, seq_len, seq_len + max_prompt_length] + // attention_type [1] + // is_final_layer [1], bool on cpu + // layer_id [1], int on cpu + // padding_offset, int, [token_num] (optional) + // cu_seqlens, int, [batch_size] (optional) + // d_prefix_prompt_batch [global_batch_size], (optional) + // each element contains ptr with buffer shape[2, local_head_num_, prompt_length, size_per_head] + // d_prefix_prompt_lengths [batch_size], int (optional) + // linear_bias_slopes [head_num] (optional) + + // output_tensors: + // hidden_features [token_num, hidden_dimension] + // key_cache [batch, local_head_num, size_per_head // x, max_seq_len, x] + // value_cache [batch, local_head_num, max_seq_len, size_per_head] + FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + FT_CHECK(output_tensors->at("key_cache").shape.size() == 5); + FT_CHECK(output_tensors->at("value_cache").shape.size() == 4 + || output_tensors->at("value_cache").shape.size() == 3); + const int request_batch_size = input_tensors->at("attention_mask").shape[0]; + const int request_seq_len = input_tensors->at("attention_mask").shape[2]; + const int max_prompt_length = + input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; + const int layer_id = input_tensors->getVal("layer_id"); + const T** d_prefix_prompt_batch = input_tensors->getPtr("d_prefix_prompt_batch", nullptr); + const int* d_prefix_prompt_lengths = input_tensors->getPtr("d_prefix_prompt_lengths", nullptr); + const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); + int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + T* linear_bias_slopes = input_tensors->getPtr("linear_bias_slopes", nullptr); + /* float* attention_query_dynamic_scale = input_tensors->getPtr("attention_query_dynamic_scale", + * nullptr); */ + + T* attention_out = output_tensors->at("hidden_features").getPtr(); + T* attention_input = input_tensors->at("input_query").getPtr(); + T* attention_mask = input_tensors->at("attention_mask").getPtr(); + + const AttentionType attention_type = input_tensors->getVal("attention_type"); + FT_CHECK_WITH_INFO(attention_type != AttentionType::FUSED_PADDED_MHA, + "LLaMA Context FUSED_PADDED_MHA is not supported !"); + + PUSH_RANGE("attention buffer alloc"); + allocateBuffer(request_batch_size, request_seq_len + max_prompt_length, attention_type != AttentionType::FUSED_MHA); + POP_RANGE; + sync_check_cuda_error(); + + const bool is_final = input_tensors->at("is_final_layer").getVal(); + + const int m = input_tensors->at("input_query").shape[0]; + + PUSH_RANGE("qkv_gemm"); + +#ifdef SPARSITY_ENABLED + const int m_padded = 8 * div_up(m, 8); + bool use_sparse = sparse_ && cublas_wrapper_->isUseSparse(1, 3 * local_hidden_units_, m_padded, hidden_units_); +#else + constexpr bool use_sparse = false; +#endif + + if (use_sparse) { +#ifdef SPARSITY_ENABLED + cublas_wrapper_->SpGemm(CUBLAS_OP_N, + CUBLAS_OP_N, + 3 * 
local_hidden_units_, + m_padded, + hidden_units_, + attention_weights->query_weight.sp_kernel, + attention_input, + qkv_buf_); +#endif + } + else if (int8_mode_ == 1) { + FT_CHECK(weight_only_int8_fc_runner_.get() != NULL && attention_weights->query_weight.int8_kernel != NULL + && attention_weights->query_weight.weight_only_quant_scale != NULL); + + weight_only_int8_fc_runner_->gemm(attention_input, + reinterpret_cast(attention_weights->query_weight.int8_kernel), + attention_weights->query_weight.weight_only_quant_scale, + qkv_buf_, + m, + 3 * local_hidden_units_, + hidden_units_, + mixed_gemm_workspace_, + mixed_gemm_ws_bytes_, + stream_); + } + else if (int8_mode_ == 2) { + cublas_wrapper_->Int8Gemm(3 * local_hidden_units_, + m, + hidden_units_, + attention_weights->query_weight.int8_kernel, + hidden_units_, + input_tensors->at("input_query").getPtr(), + hidden_units_, + reinterpret_cast(qkv_buf_), + 3 * local_hidden_units_, + attention_weights->query_weight.scale_inter, + true); + } + else { + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + 3 * local_hidden_units_, // n + m, + hidden_units_, // k + attention_weights->query_weight.kernel, + 3 * local_hidden_units_, // n + attention_input, + hidden_units_, // k + qkv_buf_, + 3 * local_hidden_units_ /* n */); + } + + sync_check_cuda_error(); + + // IDEA: append prefix prompt key value here + PrefixPromptBatchWeightsParam param{d_prefix_prompt_batch, + d_prefix_prompt_lengths, + max_prompt_length, + (size_t)layer_id * 2 * local_head_num_ * size_per_head_}; + + if (padding_offset != nullptr) { + // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous + cudaMemsetAsync( + q_buf_2_, 0, request_batch_size * request_seq_len * 3 * local_hidden_units_ * sizeof(T), stream_); + } + invokeAddFusedQKVBiasTranspose(q_buf_2_, + k_buf_2_, + v_buf_2_, + param, // prefix prompt + qkv_buf_, + attention_weights->query_weight.bias, + padding_offset, + request_batch_size, + request_seq_len, + m, + local_head_num_, + size_per_head_, + rotary_embedding_dim_, + neox_rotary_style_, + attention_weights->query_weight.scale_out, + int8_mode_, + stream_); + sync_check_cuda_error(); + + const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length + // Use batch major + // put k/v_buf from shape [B, H, PL + L, Dh] + // to cache [B, H, Dh/x, PL + L, x] and [B, H, PL + L, Dh/x, x], PL denotes prompt length + invokeTranspose4dBatchMajor(output_tensors->getPtr("key_cache"), + output_tensors->getPtr("value_cache"), + k_buf_2_, + v_buf_2_, + request_batch_size, + max_prompt_length + request_seq_len, // max input length + prefix prompt length + max_seq_len, + size_per_head_, + local_head_num_, + stream_); + // IDEA : after this, k_cache = (batch_size, num_heads, Dh/x, prefix_prompt_len + L, x) + // k_cache = (batch_size, num_heads, prefix_prompt_len + L, Dh) + sync_check_cuda_error(); + + // TODO: fmha kernels doesn't support different seq lengths of q and kv + if (attention_type == AttentionType::FUSED_MHA) { + dispatcher_fp16->setup_causal_masked_fmha(request_seq_len, request_batch_size); + dispatcher_fp16->run_causal_masked_fmha(qkv_buf_, cu_seqlens, qkv_buf_3_, true, stream_); + } + // NOTE: qkv buffer shape (batch_size, num_heads,L or prompt_len + L, Dh) + + POP_RANGE; + if (is_final == false) { + const cudaDataType_t gemm_data_type = getCudaDataType(); + const int attention_seq_len_1 = request_seq_len; // q length + const int attention_seq_len_2 = max_prompt_length + request_seq_len; // kv length + const T qk_scale = static_cast(1.0f / 
sqrtf(size_per_head_ * 1.0f)); + if (attention_type != AttentionType::FUSED_MHA) { + if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { + PUSH_RANGE("Q*K batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, + CUBLAS_OP_N, + attention_seq_len_2, // n + attention_seq_len_1, // m + size_per_head_, // k + 1.0f, + k_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_2 * size_per_head_, // n * k + q_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_1 * size_per_head_, // m * k + 0.0f, + qk_buf_float_, + CUDA_R_32F, + attention_seq_len_2, // n + attention_seq_len_2 * attention_seq_len_1, + request_batch_size * local_head_num_, // global batch size + CUDA_R_32F); + + sync_check_cuda_error(); + POP_RANGE; + + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = request_batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = local_head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = const_cast(linear_bias_slopes); // (head_num,), optional + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; + } + else { + PUSH_RANGE("Q*K batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, + CUBLAS_OP_N, + attention_seq_len_2, + attention_seq_len_1, + size_per_head_, + k_buf_2_, + size_per_head_, + attention_seq_len_2 * size_per_head_, + q_buf_2_, + size_per_head_, + attention_seq_len_1 * size_per_head_, + qk_buf_, + attention_seq_len_2, + attention_seq_len_2 * attention_seq_len_1, + request_batch_size * local_head_num_); + + POP_RANGE; + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = request_batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = local_head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = const_cast(linear_bias_slopes); // (head_num,), optional + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; + } + + PUSH_RANGE("QK*V batch gemm"); + + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, + CUBLAS_OP_N, + size_per_head_, + attention_seq_len_1, + attention_seq_len_2, + v_buf_2_, + size_per_head_, + attention_seq_len_2 * size_per_head_, + qk_buf_, + attention_seq_len_2, + attention_seq_len_1 * attention_seq_len_2, + qkv_buf_2_, + size_per_head_, + attention_seq_len_1 * size_per_head_, + request_batch_size * local_head_num_); + + // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) + if (padding_offset == nullptr) { + invokeTransposeQKV(qkv_buf_3_, + qkv_buf_2_, + request_batch_size, + attention_seq_len_1, + local_head_num_, + size_per_head_, + attention_weights->attention_output_weight.scale, + int8_mode_, + stream_); + sync_check_cuda_error(); + } + else { + invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, + qkv_buf_3_, + m, + request_batch_size, + attention_seq_len_1, + local_head_num_, + size_per_head_, + padding_offset, + attention_weights->attention_output_weight.scale, + int8_mode_, + stream_); + } + POP_RANGE; 
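// ---------------------------------------------------------------------------
// Editor's note (illustrative addition, not part of this patch): the unfused
// path above computes, per head, softmax(Q K^T * qk_scale + mask) V using two
// strided batched GEMMs around a masked-softmax kernel, where
// qk_scale = 1 / sqrt(size_per_head). The single-head CPU reference below is
// a sketch of that math only -- single_head_attention is a hypothetical
// helper with no batching, KV cache, prefix prompts, or int8 paths, and it is
// not FasterTransformer code.
// ---------------------------------------------------------------------------
#include <algorithm>
#include <cmath>
#include <vector>

// q, k, v: [seq_len x size_per_head], row-major. mask: [seq_len x seq_len],
// 0 for visible positions and a large negative value for masked ones.
std::vector<float> single_head_attention(const std::vector<float>& q,
                                         const std::vector<float>& k,
                                         const std::vector<float>& v,
                                         const std::vector<float>& mask,
                                         int seq_len,
                                         int size_per_head)
{
    const float qk_scale = 1.0f / std::sqrt(static_cast<float>(size_per_head));
    std::vector<float> out(seq_len * size_per_head, 0.0f);
    std::vector<float> scores(seq_len);

    for (int i = 0; i < seq_len; i++) {
        // logits_ij = (q_i . k_j) * qk_scale + mask_ij
        float max_logit = -1e30f;
        for (int j = 0; j < seq_len; j++) {
            float dot = 0.0f;
            for (int d = 0; d < size_per_head; d++) {
                dot += q[i * size_per_head + d] * k[j * size_per_head + d];
            }
            scores[j] = dot * qk_scale + mask[i * seq_len + j];
            max_logit = std::max(max_logit, scores[j]);
        }
        // Softmax over j; the max logit is subtracted first for numerical stability.
        float denom = 0.0f;
        for (int j = 0; j < seq_len; j++) {
            scores[j] = std::exp(scores[j] - max_logit);
            denom += scores[j];
        }
        // out_i = sum_j softmax_ij * v_j
        for (int j = 0; j < seq_len; j++) {
            for (int d = 0; d < size_per_head; d++) {
                out[i * size_per_head + d] += (scores[j] / denom) * v[j * size_per_head + d];
            }
        }
    }
    return out;
}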
+ } + sync_check_cuda_error(); + + PUSH_RANGE("proj gemm"); +#ifdef SPARSITY_ENABLED + bool use_sparse = sparse_ && cublas_wrapper_->isUseSparse(1, hidden_units_, m_padded, local_hidden_units_); +#endif + + if (use_sparse) { +#ifdef SPARSITY_ENABLED + cublas_wrapper_->SpGemm(CUBLAS_OP_N, + CUBLAS_OP_N, + hidden_units_, + m_padded, + local_hidden_units_, + attention_weights->attention_output_weight.sp_kernel, + qkv_buf_3_, + attention_out); +#endif + } + else { + if (int8_mode_ == 1) { + FT_CHECK(weight_only_int8_fc_runner_.get() != NULL + && attention_weights->attention_output_weight.int8_kernel != NULL + && attention_weights->attention_output_weight.weight_only_quant_scale != NULL); + + weight_only_int8_fc_runner_->gemm( + qkv_buf_3_, + reinterpret_cast(attention_weights->attention_output_weight.int8_kernel), + attention_weights->attention_output_weight.weight_only_quant_scale, + attention_out, + m, + hidden_units_, + local_hidden_units_, + mixed_gemm_workspace_, + mixed_gemm_ws_bytes_, + stream_); + } + else if (int8_mode_ == 2) { + int8_fc_runner_->gemm(reinterpret_cast(qkv_buf_3_), + attention_weights->attention_output_weight.int8_kernel, + QuantMode::PerTensorQuant, + attention_weights->attention_output_weight.scale_inter, + attention_weights->attention_output_weight.scale_out, + output_tensors->at("hidden_features").getPtr(), + m, + hidden_units_, + local_hidden_units_, + nullptr, + 0, + stream_); + } + else { + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + hidden_units_, + m, + local_hidden_units_, + attention_weights->attention_output_weight.kernel, + hidden_units_, + qkv_buf_3_, + local_hidden_units_, + attention_out, + hidden_units_); + } + } + POP_RANGE; + } + + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } + sync_check_cuda_error(); + FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + +template +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_batch_size, + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse, + int int8_mode): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), + max_batch_size_(max_batch_size), + max_seq_len_(max_seq_len), + head_num_(head_num), + size_per_head_(size_per_head), + hidden_units_(head_num * size_per_head), + local_head_num_(head_num), + local_hidden_units_(local_head_num_ * size_per_head), + rotary_embedding_dim_(0), + neox_rotary_style_(false), + is_qk_buf_float_(is_qk_buf_float || int8_mode == 2), + weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), + int8_fc_runner_(int8_mode == 2 ? 
std::make_shared>() : nullptr), + int8_mode_(int8_mode) +{ +} + +template +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_batch_size, + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + size_t local_head_num, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse, + int int8_mode): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), + max_batch_size_(max_batch_size), + max_seq_len_(max_seq_len), + head_num_(head_num), + size_per_head_(size_per_head), + hidden_units_(head_num * size_per_head), + local_head_num_(local_head_num), + local_hidden_units_(local_head_num_ * size_per_head), + rotary_embedding_dim_(0), + neox_rotary_style_(false), + is_qk_buf_float_(is_qk_buf_float || int8_mode == 2), + weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), + int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr), + int8_mode_(int8_mode) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + dispatcher_fp16.reset(new FusedMHARunnerFP16v2(local_head_num_, size_per_head_, sm_, 1.0f)); +} + +template +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_batch_size, + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + size_t local_head_num, + size_t rotary_embedding_dim, + bool neox_rotary_style, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse, + int int8_mode): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), + max_batch_size_(max_batch_size), + max_seq_len_(max_seq_len), + head_num_(head_num), + size_per_head_(size_per_head), + hidden_units_(head_num * size_per_head), + local_head_num_(local_head_num), + local_hidden_units_(local_head_num_ * size_per_head), + rotary_embedding_dim_(rotary_embedding_dim), + neox_rotary_style_(neox_rotary_style), + is_qk_buf_float_(is_qk_buf_float), + weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), + int8_fc_runner_(int8_mode == 2 ? 
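// NOTE (illustrative): the rotary_embedding_dim_ / neox_rotary_style_ members set by these
// constructors control how rotary position embedding (RoPE) is applied to Q and K before the
// batched GEMMs. The sketch below is the interleaved (GPT-J style) variant for one vector; the
// NeoX style instead pairs element i with element i + rot_dim / 2. Base 10000 is the conventional
// default and an assumption here (needs <cmath>); the production path does this in a fused kernel.
#if 0  // illustrative sketch
void apply_rope_interleaved(float* x, int rot_dim, int pos, float base = 10000.f)
{
    for (int i = 0; i < rot_dim; i += 2) {
        const float theta = pos * std::pow(base, -static_cast<float>(i) / rot_dim);
        const float c  = std::cos(theta);
        const float s  = std::sin(theta);
        const float x0 = x[i];
        const float x1 = x[i + 1];
        x[i]     = x0 * c - x1 * s;  // rotate the (x[i], x[i+1]) pair by theta
        x[i + 1] = x0 * s + x1 * c;
    }
}
#endif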
std::make_shared>() : nullptr), + int8_mode_(int8_mode) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + dispatcher_fp16.reset(new FusedMHARunnerFP16v2(local_head_num_, size_per_head_, sm_, 1.0f)); +} + +template +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer): + BaseAttentionLayer(attention_layer.stream_, + attention_layer.cublas_wrapper_, + attention_layer.allocator_, + attention_layer.is_free_buffer_after_forward_, + attention_layer.sparse_), + max_batch_size_(attention_layer.max_batch_size_), + max_seq_len_(attention_layer.max_seq_len_), + head_num_(attention_layer.head_num_), + size_per_head_(attention_layer.size_per_head_), + hidden_units_(attention_layer.hidden_units_), + local_head_num_(attention_layer.local_head_num_), + local_hidden_units_(attention_layer.local_hidden_units_), + rotary_embedding_dim_(attention_layer.rotary_embedding_dim_), + neox_rotary_style_(attention_layer.neox_rotary_style_), + is_qk_buf_float_(attention_layer.is_qk_buf_float_), + weight_only_int8_fc_runner_(attention_layer.weight_only_int8_fc_runner_), + int8_fc_runner_(attention_layer.int8_fc_runner_), + int8_mode_(attention_layer.int8_mode_) +{ +} + +template +LLaMAContextAttentionLayer::~LLaMAContextAttentionLayer() +{ + cublas_wrapper_ = nullptr; + freeBuffer(); +} + +template +void LLaMAContextAttentionLayer::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + // const auto type_size = int8_mode_ == 2 ? sizeof(int8_t) : sizeof(T); + // NOTE (perkzz): use sizeof(T) here for cutlass int8 kernels. + const auto type_size = sizeof(T); + qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, type_size * 3 * batch_size * seq_len * local_hidden_units_, true); + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * local_hidden_units_, true); + k_buf_2_ = q_buf_2_ + batch_size * seq_len * local_hidden_units_; + v_buf_2_ = k_buf_2_ + batch_size * seq_len * local_hidden_units_; + + // save memory usage when using fmha + if (allocate_qk_buf) { + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * local_head_num_ * seq_len * seq_len, true); + } + else { + allocator_->free((void**)(&qk_buf_)); + } + qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * local_hidden_units_, true); + qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, type_size * batch_size * seq_len * local_hidden_units_, true); + + if (is_qk_buf_float_ == true) { + if (allocate_qk_buf) { + qk_buf_float_ = (float*)allocator_->reMalloc( + qk_buf_float_, sizeof(float) * batch_size * local_head_num_ * seq_len * seq_len, true); + } + else { + allocator_->free((void**)(&qk_buf_float_)); + } + } + + if (int8_mode_ == 1) { + // We use max_size for n and k since we reuse buffers for both FCs and want to allocate the max + // possible memory that would be required by any of the individual gemms. + const int max_size = std::max(hidden_units_, 3 * local_hidden_units_); + mixed_gemm_ws_bytes_ = weight_only_int8_fc_runner_->getWorkspaceSize(batch_size * seq_len, max_size, max_size); + mixed_gemm_workspace_ = (char*)allocator_->reMalloc(mixed_gemm_workspace_, mixed_gemm_ws_bytes_, false); + } + + if (int8_mode_ == 1) { + // We use max_size for n and k since we reuse buffers for both FCs and want to allocate the max + // possible memory that would be required by any of the individual gemms. 
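// NOTE (illustrative): worked sizing example for the buffers allocated above, using the llama_33B
// geometry that appears elsewhere in this patch (head_num = 52, size_per_head = 128, so
// local_hidden_units_ = 6656 with no tensor parallelism), FP16 storage, request_batch_size = 4 and
// an assumed seq_len of 512. The numbers show why allocate_qk_buf is skipped on the fused-MHA path.
#if 0  // illustrative sketch
constexpr size_t kBatch = 4, kSeqLen = 512, kHeads = 52, kSizePerHead = 128;
constexpr size_t kHidden = kHeads * kSizePerHead;  // 6656
constexpr size_t kHalf   = 2;                      // sizeof(half)
constexpr size_t kQkvBufBytes     = kHalf * 3 * kBatch * kSeqLen * kHidden;               // ~81.8 MB
constexpr size_t kQkBufBytes      = kHalf * kBatch * kHeads * kSeqLen * kSeqLen;          // ~109.1 MB
constexpr size_t kQkBufFloatBytes = sizeof(float) * kBatch * kHeads * kSeqLen * kSeqLen;  // ~218.1 MB
static_assert(kQkBufBytes == 109051904, "2 B * 4 * 52 * 512 * 512");
#endif
// The max_size workspace below applies the same "size for the largest GEMM" idea to the INT8 runners.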
+ const int max_size = std::max(hidden_units_, 3 * local_hidden_units_); + mixed_gemm_ws_bytes_ = weight_only_int8_fc_runner_->getWorkspaceSize(batch_size * seq_len, max_size, max_size); + mixed_gemm_workspace_ = (char*)allocator_->reMalloc(mixed_gemm_workspace_, mixed_gemm_ws_bytes_, false); + } + else if (int8_mode_ == 2) { + const int max_size = std::max(hidden_units_, 3 * local_hidden_units_); + int8_gemm_ws_bytes_ = int8_fc_runner_->getWorkspaceSize(batch_size * seq_len, max_size, max_size); + int8_gemm_workspace_ = (char*)allocator_->reMalloc(int8_gemm_workspace_, int8_gemm_ws_bytes_, false); + } + is_allocate_buffer_ = true; +} + +template +void LLaMAContextAttentionLayer::freeBuffer() +{ + if (is_allocate_buffer_) { + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + allocator_->free((void**)(&qkv_buf_)); + allocator_->free((void**)(&q_buf_2_)); + allocator_->free((void**)(&qk_buf_)); + allocator_->free((void**)(&qkv_buf_2_)); + allocator_->free((void**)(&qkv_buf_3_)); + + if (is_qk_buf_float_ == true) { + allocator_->free((void**)(&qk_buf_float_)); + } + + allocator_->free((void**)(&mixed_gemm_workspace_)); + mixed_gemm_ws_bytes_ = 0; + + allocator_->free((void**)(&int8_gemm_workspace_)); + int8_gemm_ws_bytes_ = 0; + + is_allocate_buffer_ = false; + } +} + +template class LLaMAContextAttentionLayer; +template class LLaMAContextAttentionLayer; +#ifdef ENABLE_BF16 +template class LLaMAContextAttentionLayer<__nv_bfloat16>; +#endif + +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h new file mode 100644 index 000000000..6a18d734e --- /dev/null +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include "3rdparty/trt_fused_multihead_attention/qkvToContext.h"
+#include "src/fastertransformer/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
+#include "src/fastertransformer/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
+#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h"
+
+namespace fastertransformer {
+
+template<typename T>
+class LLaMAContextAttentionLayer: public BaseAttentionLayer<T> {
+private:
+    // buffer handling
+    size_t max_batch_size_ = 0;
+    size_t max_seq_len_    = 0;
+
+    // metadata
+    const size_t head_num_;
+    const size_t size_per_head_;
+    const size_t hidden_units_;
+    const size_t local_head_num_;
+    const size_t local_hidden_units_;
+    const size_t rotary_embedding_dim_;
+    const bool   neox_rotary_style_;
+
+    // fmha runner
+    int                        sm_ = getSMVersion();
+    std::unique_ptr<MHARunner> dispatcher_fp16;
+
+    void allocateBuffer() override;
+    void allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf);
+    void freeBuffer() override;
+
+    using BaseAttentionLayer<T>::is_free_buffer_after_forward_;
+    using BaseAttentionLayer<T>::is_allocate_buffer_;
+    using BaseAttentionLayer<T>::cublas_wrapper_;
+
+    bool is_qk_buf_float_;
+
+    std::shared_ptr<CutlassFpAIntBGemmRunner<T, uint8_t>> weight_only_int8_fc_runner_;
+    std::shared_ptr<CutlassInt8GemmRunner<T>>             int8_fc_runner_;
+
+protected:
+    using BaseAttentionLayer<T>::allocator_;
+    using BaseAttentionLayer<T>::stream_;
+    using BaseAttentionLayer<T>::sparse_;
+    T*     qkv_buf_              = nullptr;
+    T*     q_buf_2_              = nullptr;
+    T*     k_buf_2_              = nullptr;
+    T*     v_buf_2_              = nullptr;
+    T*     qk_buf_               = nullptr;
+    float* qk_buf_float_         = nullptr;
+    T*     qkv_buf_2_            = nullptr;
+    T*     qkv_buf_3_            = nullptr;
+    char*  mixed_gemm_workspace_ = nullptr;
+    size_t mixed_gemm_ws_bytes_  = 0;
+    char*  int8_gemm_workspace_  = nullptr;
+    size_t int8_gemm_ws_bytes_   = 0;
+
+    // int8_mode_ == 0 means we don't use any mechanism related to INT8.
+ // int8_mode_ == 1 for weight quantized only gemm for GPT + // int8_mode_ == 2 for SmoothQuant O3 (per tensor scales) + const int int8_mode_ = 0; + +public: + LLaMAContextAttentionLayer(size_t max_batch_size, + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse = false, + int int8_mode = 0); + + LLaMAContextAttentionLayer(size_t max_batch_size, + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + size_t local_head_num, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse = false, + int int8_mode = 0); + + LLaMAContextAttentionLayer(size_t max_batch_size, + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + size_t local_head_num, + size_t rotary_embedding_dim, + bool neox_rotary_style_, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse = false, + int int8_mode = 0); + + LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer); + + virtual ~LLaMAContextAttentionLayer(); + + void + forward(TensorMap* output_tensors, TensorMap* input_tensors, const AttentionWeight* attention_weights) override; +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt index da314ec7d..0c5106f00 100644 --- a/src/fastertransformer/models/llama/CMakeLists.txt +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -19,26 +19,12 @@ set_property(TARGET LLaMADecoderLayerWeight PROPERTY POSITION_INDEPENDENT_CODE set_property(TARGET LLaMADecoderLayerWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(LLaMADecoderLayerWeight PUBLIC memory_utils cuda_utils logger) -add_library(LLaMADecoder STATIC LLaMADecoder.cc) -set_property(TARGET LLaMADecoder PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET LLaMADecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(LLaMADecoder PUBLIC -lcudart cublasMMWrapper - TensorParallelDecoderSelfAttentionLayer - TensorParallelGeluFfnLayer - layernorm_kernels - add_residual_kernels - LLaMADecoderLayerWeight - tensor - nccl_utils - cuda_utils - logger) - add_library(LLaMAContextDecoder STATIC LLaMAContextDecoder.cc) set_property(TARGET LLaMAContextDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET LLaMAContextDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(LLaMAContextDecoder PUBLIC -lcudart cublasMMWrapper - TensorParallelGptContextAttentionLayer - TensorParallelGeluFfnLayer + LLaMAContextAttentionLayer + FfnLayer layernorm_kernels add_residual_kernels gpt_kernels @@ -56,11 +42,9 @@ add_library(LLaMA STATIC LLaMA.cc) set_property(TARGET LLaMA PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET LLaMA PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(LLaMA PUBLIC -lcudart - LLaMADecoder LLaMAContextDecoder decoding_kernels gpt_kernels - DynamicDecodeLayer BaseBeamSearchLayer bert_preprocess_kernels tensor diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index d03183a0a..fb8eb4f9c 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -42,30 +42,6 @@ void 
LLaMA::initialize() attention_type_, custom_all_reduce_comm_, enable_custom_all_reduce_); - - llama_decoder_ = new LLaMADecoder(head_num_, - size_per_head_, - inter_size_, - num_layer_, - rotary_embedding_dim_, - neox_rotary_style_, - layernorm_eps_, - pipeline_para_, - stream_, - cublas_wrapper_, - allocator_, - is_free_buffer_after_forward_, - custom_all_reduce_comm_, - enable_custom_all_reduce_); - - dynamic_decode_layer_ = new DynamicDecodeLayer(vocab_size_, - vocab_size_, - 0, // end_id, deprecated - stream_, - cublas_wrapper_, - allocator_, - is_free_buffer_after_forward_, - cuda_device_prop_); } template @@ -273,8 +249,6 @@ LLaMA::LLaMA(LLaMA const& llama): template LLaMA::~LLaMA() { - delete llama_decoder_; - delete dynamic_decode_layer_; delete llama_context_decoder_; freeBuffer(); } @@ -373,15 +347,8 @@ void LLaMA::forward(std::unordered_map* output_ten setSeqLimitLen(seq_limit_len_, input_tensors->at("output_seq_len"), limit_len_offset, batch_size); sync_check_cuda_error(); - { - TensorMap input_map(*input_tensors); - dynamic_decode_layer_->setup(batch_size, beam_width, &input_map); - handleOptArg(&input_map, "start_id", start_ids_buf_, start_id_, batch_size); - handleOptArg(&input_map, "end_id", end_ids_buf_, end_id_, batch_size); - } - - const DataType data_type = getTensorType(); + const DataType data_type = getTensorType(); const std::vector self_k_cache_shape = {num_layer_ / pipeline_para_.world_size_, batch_size * beam_width, head_num_, @@ -402,7 +369,6 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); - // handle first step if (max_input_length > 1) { invokeTileGptInputs(tiled_input_ids_buf_, @@ -533,232 +499,6 @@ void LLaMA::forward(std::unordered_map* output_ten beam_width, stream_); - for (int step = max_input_length; step < (int)max_output_seq_len; step++) { - const int src_indir_idx = (step - max_input_length) % 2; - const int tgt_indir_idx = 1 - src_indir_idx; - - const size_t local_batch_size = getLocalBatchSize(batch_size, 1, pipeline_para_.world_size_); - FT_CHECK(batch_size % local_batch_size == 0); - const size_t iteration_num = batch_size / local_batch_size; - *generation_should_stop_ = true; - - for (uint ite = 0; ite < iteration_num; ++ite) { - const int id_offset = ite * local_batch_size * beam_width; - const int hidden_units_offset = id_offset * hidden_units_; - const int vocab_size_units_offset = id_offset * vocab_size_; - - if (!(max_input_length > 1 && step == max_input_length)) { - if (pipeline_para_.rank_ == 0) { - invokeEmbeddingLookupPosEncodingPadCount(decoder_input_buf_ + hidden_units_offset, - llama_weights->pre_decoder_embedding_table, - llama_weights->position_encoding_table, - output_ids_buf_ + id_offset, - tiled_total_padding_count_ + id_offset, - local_batch_size * beam_width, - hidden_units_, - (T)(1.0f), - step - 1, - batch_size * beam_width, - 0, - stream_); - sync_check_cuda_error(); - } - std::unordered_map decoder_input_tensors{ - {"decoder_input", - Tensor{MEMORY_GPU, - data_type, - {local_batch_size * beam_width, hidden_units_}, - decoder_input_buf_ + hidden_units_offset}}, - {"finished", - Tensor{MEMORY_GPU, TYPE_BOOL, {local_batch_size * beam_width}, finished_buf_ + id_offset}}, - {"sequence_lengths", - Tensor{MEMORY_GPU, TYPE_INT32, {local_batch_size * beam_width}, sequence_lengths_ + id_offset}}, - {"total_padding_tokens", - Tensor{MEMORY_GPU, - TYPE_INT32, - {local_batch_size * beam_width}, - tiled_total_padding_count_ + id_offset}}, - {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, 
&max_input_length}}, - {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, - {"ite", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &ite}}, - {"cache_indirection", - Tensor{MEMORY_GPU, - TYPE_INT32, - {local_batch_size, beam_width, max_output_seq_len}, - beam_width > 1 ? cache_indirections_[src_indir_idx] + id_offset * max_output_seq_len : - nullptr}}, - {"masked_tokens", - Tensor{MEMORY_GPU, - TYPE_BOOL, - {local_batch_size * beam_width, max_cache_seq_len}, - masked_tokens_ + id_offset * max_cache_seq_len}}}; - std::unordered_map decoder_output_tensors{ - {"decoder_output", - Tensor{MEMORY_GPU, - data_type, - {local_batch_size * beam_width, hidden_units_}, - decoder_output_buf_ + hidden_units_offset}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, - {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}}; - llama_decoder_->forward( - &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); - } - - if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { - invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_ + hidden_units_offset, - decoder_output_buf_ + hidden_units_offset, - llama_weights->post_decoder_layernorm.gamma, - llama_weights->post_decoder_layernorm.beta, - layernorm_eps_, - local_batch_size * beam_width, - hidden_units_, - stream_); - sync_check_cuda_error(); - - float alpha = 1.0f; - float beta = 0.0f; - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - vocab_size_, - local_batch_size * beam_width, - hidden_units_, // k - &alpha, - llama_weights->post_decoder_embedding.kernel, - gemm_data_type, - hidden_units_, // k - normed_decoder_output_buf_ + hidden_units_offset, - gemm_data_type, - hidden_units_, // k - &beta, - logits_buf_ + vocab_size_units_offset, - CUDA_R_32F, - vocab_size_, - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - - int tmp_local_batch_size = local_batch_size; - bool is_initialize_random_table = step == max_input_length; - std::unordered_map dynamic_decode_input_tensors{ - {"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size_}, logits_buf_}}, - // {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size_}, nullptr}}, - {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, - {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, - {"input_lengths", - Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, tiled_input_lengths_buf_}}, - {"sequence_limit_length", Tensor{MEMORY_GPU, TYPE_UINT32, {batch_size}, seq_limit_len_}}, - {"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}}, - {"src_cache_indirection", - Tensor{MEMORY_GPU, - TYPE_INT32, - {local_batch_size, beam_width, max_output_seq_len}, - cache_indirections_[src_indir_idx] + id_offset * max_output_seq_len}}, - {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &tmp_local_batch_size}}, - {"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, end_ids_buf_}}, - {"is_initialize_random_table", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &is_initialize_random_table}}}; - - for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) { - if (dynamic_decode_input_tensors.find(t->first) == dynamic_decode_input_tensors.end()) { - dynamic_decode_input_tensors.insert(*t); - } - } - - // common outputs - bool subbatch_should_stop = false; - std::unordered_map dynamic_decode_output_tensors{ - {"output_ids", - Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, output_ids_buf_}}, - {"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, 
finished_buf_}}, - // cum_log_probs is necessary for beam search, while it is optional for sampling. - {"cum_log_probs", - Tensor{MEMORY_GPU, - TYPE_FP32, - {batch_size * beam_width}, - ((beam_width > 1) || (output_tensors->count("cum_log_probs") > 0)) ? cum_log_probs_ : - nullptr}}, - {"output_log_probs", - Tensor{MEMORY_GPU, - TYPE_FP32, - {max_seq_len, batch_size, beam_width}, - output_tensors->count("output_log_probs") > 0 - && output_tensors->at("output_log_probs").data != nullptr ? - output_log_probs_buf_ : - nullptr}}, - {"parent_ids", - Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, parent_ids_buf_}}, - {"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, sequence_lengths_}}, - {"tgt_cache_indirection", - Tensor{MEMORY_GPU, - TYPE_INT32, - {local_batch_size, beam_width, max_output_seq_len}, - cache_indirections_[tgt_indir_idx] + id_offset * max_output_seq_len}}, - {"should_stop", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &subbatch_should_stop}}}; - - for (auto t = output_tensors->begin(); t != output_tensors->end(); ++t) { - // Handle exceptions. - if (t->first == "cum_log_probs" || t->first == "output_log_probs") { - continue; - } - dynamic_decode_output_tensors.insert(*t); - } - - dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors); - *generation_should_stop_ &= subbatch_should_stop; - } - } - - if (pipeline_para_.world_size_ > 1) { - ftNcclGroupStart(); - ftNcclBroadCast(output_ids_buf_ + step * batch_size * beam_width, - batch_size * beam_width, - pipeline_para_.world_size_ - 1, - pipeline_para_, - stream_); - - ftNcclBroadCast( - sequence_lengths_, batch_size * beam_width, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); - - ftNcclBroadCast(generation_should_stop_, 1, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); - - if (beam_width > 1) { - ftNcclBroadCast(cache_indirections_[tgt_indir_idx], - batch_size * beam_width * max_output_seq_len, - pipeline_para_.world_size_ - 1, - pipeline_para_, - stream_); - } - ftNcclGroupEnd(); - // throw errors when detected - NcclParam tensor_para(0, 1); - ftNcclStreamSynchronize(tensor_para, pipeline_para_, stream_); - sync_check_cuda_error(); - } - - if (*generation_should_stop_) { - break; - } - if (token_generated_cb_ && step + 1 < (int)max_output_seq_len) { - setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); - sendTensorsToFirstPipelineNode(output_tensors, input_tensors); - - if (pipeline_para_.rank_ == 0) { - token_generated_cb_(output_tensors, token_generated_ctx_); - } - } - if (step == max_input_length) { - /* We have just finished processing input: update the padding count: - * total_padding_count += (max_input_length - input_lengths) - */ - invokeUpdatePaddingCount(tiled_total_padding_count_, - input_tensors->at("input_lengths").getPtr(), // not_tiled - (const int*)nullptr, - max_input_length, - 0, - batch_size, - beam_width, - stream_); - } - } - setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 7a66a2ebf..303236b72 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -19,9 +19,7 @@ #include #include -#include "src/fastertransformer/layers/DynamicDecodeLayer.h" #include "src/fastertransformer/models/llama/LLaMAContextDecoder.h" 
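// NOTE (illustrative): with DynamicDecodeLayer dropped here, LLaMA appears to keep only the
// context (prompt) pass. The per-step loop deleted from LLaMA.cc above moved hidden states
// between adjacent pipeline ranks and let only the last rank produce logits; a schematic of that
// hand-off is sketched below using the NCCL helpers from the removed code. Buffer names are
// placeholders, not the real members.
#if 0  // illustrative sketch of the pipeline-parallel hand-off
if (pipeline_para_.rank_ != 0) {  // first locally owned layer: receive activations from the previous rank
    ftNcclRecv(layer_input, local_batch_size * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_);
}
// ... run the layers owned by this rank ...
if (pipeline_para_.rank_ != pipeline_para_.world_size_ - 1) {  // last locally owned layer: pass activations on
    ftNcclSend(layer_output, local_batch_size * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, stream_);
}
// per step, the last rank then broadcast output_ids / sequence_lengths / the stop flag to all ranks
#endif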
-#include "src/fastertransformer/models/llama/LLaMADecoder.h" #include "src/fastertransformer/models/llama/LLaMAWeight.h" #include "src/fastertransformer/utils/custom_ar_comm.h" @@ -56,9 +54,7 @@ class LLaMA: public BaseLayer { const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); - LLaMADecoder* llama_decoder_; LLaMAContextDecoder* llama_context_decoder_; - DynamicDecodeLayer* dynamic_decode_layer_; void allocateBuffer() override; void allocateBuffer( diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 781338253..119c98041 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -19,27 +19,27 @@ #include "src/fastertransformer/kernels/gpt_kernels.h" #include "src/fastertransformer/layers/FfnLayer.h" -#include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" +#include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" namespace fastertransformer { template void LLaMAContextDecoder::initialize() { - self_attention_layer_ = new GptContextAttentionLayer(0, // max_batch_size - 0, // max_seq_len - head_num_, - size_per_head_, - head_num_, - rotary_embedding_dim_, - neox_rotary_style_, - stream_, - cublas_wrapper_, - allocator_, - is_free_buffer_after_forward_, - is_qk_buf_float_, - false, - 0); + self_attention_layer_ = new LLaMAContextAttentionLayer(0, // max_batch_size + 0, // max_seq_len + head_num_, + size_per_head_, + head_num_, + rotary_embedding_dim_, + neox_rotary_style_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + is_qk_buf_float_, + false, + 0); ffn_layer_ = new GeluFfnLayer(0, // max_batch_size 0, diff --git a/src/fastertransformer/models/llama/LLaMADecoder.cc b/src/fastertransformer/models/llama/LLaMADecoder.cc deleted file mode 100644 index a98cd0159..000000000 --- a/src/fastertransformer/models/llama/LLaMADecoder.cc +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/fastertransformer/models/llama/LLaMADecoder.h" -#include "src/fastertransformer/layers/FfnLayer.h" -#include "src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h" - -namespace fastertransformer { - -template -void LLaMADecoder::initialize() -{ - self_attention_layer_ = new DecoderSelfAttentionLayer(0, // max_batch_size - head_num_, - size_per_head_, - head_num_, - rotary_embedding_dim_, - neox_rotary_style_, - head_num_ * size_per_head_, - 1.0f, - stream_, - cublas_wrapper_, - allocator_, - is_free_buffer_after_forward_, - false, - 0); - - ffn_layer_ = new GeluFfnLayer(0, // max_batch_size - 1, - head_num_, - size_per_head_, - 0, // expert_num - inter_size_, - stream_, - cublas_wrapper_, - allocator_, - is_free_buffer_after_forward_, - false, - 0, - false // use_gated_activation = false - ); -} - -template -void LLaMADecoder::allocateBuffer() -{ - FT_CHECK(false); -} - -template -void LLaMADecoder::allocateBuffer(size_t batch_size) -{ - decoder_normed_input_ = reinterpret_cast( - allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * hidden_units_, false)); - self_attn_output_ = - reinterpret_cast(allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * hidden_units_, false)); - ffn_output_ = - reinterpret_cast(allocator_->reMalloc(ffn_output_, sizeof(T) * batch_size * hidden_units_, false)); - decoder_layer_output_ = reinterpret_cast( - allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * hidden_units_, false)); - is_allocate_buffer_ = true; -} - -template -void LLaMADecoder::freeBuffer() -{ - if (is_allocate_buffer_ == true) { - allocator_->free((void**)(&decoder_normed_input_)); - allocator_->free((void**)(&self_attn_output_)); - allocator_->free((void**)(&ffn_output_)); - allocator_->free((void**)(&decoder_layer_output_)); - is_allocate_buffer_ = false; - } -} - -template -bool LLaMADecoder::isValidLayerParallelId(uint l) -{ - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) - && (l < local_num_layer * (pipeline_para_.rank_ + 1)); -} - -template -bool LLaMADecoder::isFirstLayerParallelId(uint l) -{ - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); -} - -template -bool LLaMADecoder::isLastLayerParallelId(uint l) -{ - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return l < num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); -} - -template -int LLaMADecoder::getFirstLayerParallelId() -{ - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return local_num_layer * pipeline_para_.rank_; -} - -template -LLaMADecoder::LLaMADecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - bool neox_rotary_style, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): - BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), - head_num_(head_num), - size_per_head_(size_per_head), - inter_size_(inter_size), - num_layer_(num_layer), - rotary_embedding_dim_(rotary_embedding_dim), - neox_rotary_style_(neox_rotary_style), - layernorm_eps_(layernorm_eps), 
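// NOTE (illustrative): the deleted decoder's isValidLayerParallelId / isFirstLayerParallelId /
// isLastLayerParallelId above split num_layer_ across pipeline ranks by ceiling division, so each
// rank owns the contiguous block [rank * local, (rank + 1) * local). Standalone sketch using the
// llama_config.ini values from this patch (decoder_layers = 60, pipeline_para_size = 4);
// illustrative only.
#if 0  // illustrative sketch
#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    const int num_layer = 60, world_size = 4;
    const int local     = (int)std::ceil(num_layer * 1.0f / world_size);  // 15 layers per rank
    for (int rank = 0; rank < world_size; ++rank) {
        const int first = local * rank;
        const int last  = std::min(num_layer, local * (rank + 1)) - 1;
        std::printf("rank %d runs layers %d..%d\n", rank, first, last);  // e.g. rank 2 -> 30..44
    }
    return 0;
}
#endif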
- hidden_units_(head_num_ * size_per_head), - pipeline_para_(pipeline_para), - custom_all_reduce_comm_(custom_all_reduce_comm), - enable_custom_all_reduce_(enable_custom_all_reduce) -{ - initialize(); -} - -template -LLaMADecoder::LLaMADecoder(LLaMADecoder const& decoder): - BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), - head_num_(decoder.head_num_), - size_per_head_(decoder.size_per_head_), - inter_size_(decoder.inter_size_), - num_layer_(decoder.num_layer_), - rotary_embedding_dim_(decoder.rotary_embedding_dim_), - neox_rotary_style_(decoder.neox_rotary_style_), - layernorm_eps_(decoder.layernorm_eps_), - hidden_units_(decoder.hidden_units_), - pipeline_para_(decoder.pipeline_para_), - custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), - enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) -{ - initialize(); -} - -template -LLaMADecoder::~LLaMADecoder() -{ - delete self_attention_layer_; - delete ffn_layer_; - freeBuffer(); -} - -template -void LLaMADecoder::forward(std::vector* output_tensors, - const std::vector* input_tensors, - const std::vector*>* llama_decoder_layer_weight) -{ - FT_CHECK(false); -} - -template -void LLaMADecoder::forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const std::vector*>* llama_decoder_layer_weight) -{ - // input tensors: - // decoder_input [local_batch_size, hidden_dimension], - // finished [local_batch_size], - // sequence_lengths [local_batch_size] - // total_padding_tokens [local_batch_size], - // max_input_length [1] on cpu - // step [1] on cpu - // ite [1] on cpu - // cache_indirection [local_batch_size / beam_width, beam_width, memory_len] - // Here, local_batch_size contains the beam_width, so local_batch_size / beam_width - // is real local_batch_size. - // masked_tokens[local_batch_size, memory_len] - - // output tensors: - // decoder_output [local_batch_size, hidden_dimension], - // key_cache [num_layer, batch_size, head_num, size_per_head // x, memory_len, x] - // value_cache [num_layer, batch_size, head_num, memory_len, size_per_head] - - FT_CHECK(input_tensors->size() == 11); - FT_CHECK(output_tensors->size() == 3); - - const DataType data_type = getTensorType(); - const size_t local_batch_size = input_tensors->at("decoder_input").shape[0]; - allocateBuffer(local_batch_size); - const int ite = input_tensors->at("ite").getVal(); - - T* decoder_input = input_tensors->at("decoder_input").getPtr(); - T* decoder_output = output_tensors->at("decoder_output").getPtr(); - - Tensor& k_cache = output_tensors->at("key_cache"); - Tensor& v_cache = output_tensors->at("value_cache"); - std::vector self_k_cache_size; - self_k_cache_size.push_back(local_batch_size); - for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { - self_k_cache_size.push_back(*t); - } - std::vector self_v_cache_size; - self_v_cache_size.push_back(local_batch_size); - for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { - self_v_cache_size.push_back(*t); - } - - for (uint l = 0; l < num_layer_; l++) { - if (isValidLayerParallelId(l) == false) { - continue; - } - T* layer_input = (l == 0) ? decoder_input : decoder_layer_output_; - T* layer_output = (l == num_layer_ - 1) ? 
decoder_output : decoder_layer_output_; - - if (isFirstLayerParallelId(l) == true && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { - int data_size = local_batch_size * hidden_units_; - // ftNcclRecv(layer_input, local_batch_size * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, - // stream_); - - ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - } - - invokeGeneralLLaMALayerNorm(decoder_normed_input_, - layer_input, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, - layernorm_eps_, - local_batch_size, - hidden_units_, - stream_); - sync_check_cuda_error(); - - TensorMap self_attention_input_tensors(*input_tensors); - self_attention_input_tensors.insert( - "input_query", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}); - - size_t cache_offset = l - getFirstLayerParallelId(); - for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { - cache_offset *= *t; - }; - size_t ite_cache_offset = ite * local_batch_size; - for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { - ite_cache_offset *= *t; - } - cache_offset += ite_cache_offset; - - TensorMap self_attention_output_tensors{ - {"hidden_features", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, self_attn_output_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, - {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; - - self_attention_layer_->forward(&self_attention_output_tensors, - &self_attention_input_tensors, - &llama_decoder_layer_weight->at(l)->self_attention_weights); - - invokeGeneralAddBiasResidualPreLayerNorm( - self_attn_output_, - decoder_normed_input_, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, - layernorm_eps_, - local_batch_size, - hidden_units_, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - 0, - stream_); - - TensorMap ffn_input_tensors( - {{"ffn_input", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}}}); - TensorMap ffn_output_tensors( - {{"ffn_output", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, layer_output}}}); - ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); - - invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - local_batch_size, - hidden_units_, - stream_); - - sync_check_cuda_error(); - - if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 - && pipeline_para_.world_size_ > 1) { - int data_size = local_batch_size * hidden_units_; - ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); - } - } - - if (is_free_buffer_after_forward_ == true) { - freeBuffer(); - } -} - -template class LLaMADecoder; -template class LLaMADecoder; - -} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoder.h b/src/fastertransformer/models/llama/LLaMADecoder.h deleted file mode 100644 index 773637d65..000000000 --- 
a/src/fastertransformer/models/llama/LLaMADecoder.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include "src/fastertransformer/kernels/add_residual_kernels.h" -#include "src/fastertransformer/kernels/layernorm_kernels.h" -#include "src/fastertransformer/layers/BaseLayer.h" -#include "src/fastertransformer/layers/FfnLayer.h" -#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" -#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" -#include "src/fastertransformer/utils/Tensor.h" -#include "src/fastertransformer/utils/allocator.h" -#include "src/fastertransformer/utils/cublasMMWrapper.h" -#include "src/fastertransformer/utils/custom_ar_comm.h" -#include "src/fastertransformer/utils/nccl_utils.h" - -namespace fastertransformer { - -template -class LLaMADecoder: public BaseLayer { -private: -protected: - void allocateBuffer() override; - void allocateBuffer(size_t batch_size); - void freeBuffer() override; - bool isValidLayerParallelId(uint l); - bool isFirstLayerParallelId(uint l); - bool isLastLayerParallelId(uint l); - int getFirstLayerParallelId(); - virtual void initialize(); - - // meta data - size_t head_num_; - size_t size_per_head_; - size_t inter_size_; - size_t num_layer_; - size_t rotary_embedding_dim_; - bool neox_rotary_style_; - size_t hidden_units_; - float layernorm_eps_; - - NcclParam pipeline_para_; - - std::shared_ptr custom_all_reduce_comm_; - int enable_custom_all_reduce_; - - T* decoder_normed_input_ = nullptr; - T* self_attn_output_ = nullptr; - T* ffn_output_ = nullptr; - T* decoder_layer_output_ = nullptr; - - BaseAttentionLayer* self_attention_layer_; - FfnLayer* ffn_layer_; - -public: - LLaMADecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - bool neox_rotary_style, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce_ = 0); - - LLaMADecoder(LLaMADecoder const& decoder); - - virtual ~LLaMADecoder(); - - virtual void forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const std::vector*>* decoder_layer_weights); - - virtual void forward(std::vector* output_tensors, - const std::vector* input_tensors, - const std::vector*>* decoder_layer_weights); -}; - -} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc index 3c40613fc..ff2ec11be 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc @@ -59,19 +59,19 @@ LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const 
LLaMADecoderLayerWeigh hidden_units_(other.hidden_units_), inter_size_(other.inter_size_) { mallocWeights(); - //cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); nullptr; + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_); - //cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ * hidden_units_); - //cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_); - //cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ * hidden_units_); - //cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_ * inter_size_); - //cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); - //cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); + cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); setWeightPtr(); } @@ -84,19 +84,19 @@ LLaMADecoderLayerWeight& LLaMADecoderLayerWeight::operator=(const LLaMADec mallocWeights(); - //cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_); - //cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ * hidden_units_); - //cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_); - //cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ * hidden_units_); - //cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_ * inter_size_); - //cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); - //cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); + cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); setWeightPtr(); return *this; @@ -107,8 +107,8 @@ void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType { FT_CHECK(is_maintain_buffer == true); -// loadWeightFromBin( -// weights_ptr[0], {(size_t)hidden_units_}, dir_path 
+ ".attention_norm.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".attention_norm.bias.bin", model_file_type); loadWeightFromBin( weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".attention_norm.weight.bin", model_file_type); @@ -116,67 +116,60 @@ void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType {(size_t)hidden_units_, (size_t)(3 * hidden_units_)}, dir_path + ".attention.query_key_value.weight.bin", model_file_type); -// loadWeightFromBin(weights_ptr[3], -// {(size_t)(3 * hidden_units_)}, -// dir_path + ".attention.query_key_value.bias.bin", -// model_file_type); + loadWeightFromBin(weights_ptr[3], + {(size_t)(3 * hidden_units_)}, + dir_path + ".attention.query_key_value.bias.bin", + model_file_type); loadWeightFromBin(weights_ptr[4], {(size_t)(hidden_units_), (size_t)hidden_units_}, dir_path + ".attention.wo.weight.bin", model_file_type); -// loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.wo.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.wo.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[6], {(size_t)hidden_units_, (size_t)(inter_size_)}, dir_path + ".feed_forward.w1.weight.bin", model_file_type); -// loadWeightFromBin( -// weights_ptr[7], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w1.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[7], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w1.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[8], {(size_t)(inter_size_), (size_t)hidden_units_}, dir_path + ".feed_forward.w2.weight.bin", model_file_type); -// loadWeightFromBin( -// weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".feed_forward.w2.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".feed_forward.w2.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[10], {(size_t)hidden_units_, (size_t)(inter_size_)}, dir_path + ".feed_forward.w3.weight.bin", model_file_type); -// loadWeightFromBin( -// weights_ptr[11], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w3.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[11], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w3.bias.bin", model_file_type); -// loadWeightFromBin(weights_ptr[12], {(size_t)hidden_units_}, dir_path + ".ffn_norm.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[12], {(size_t)hidden_units_}, dir_path + ".ffn_norm.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".ffn_norm.weight.bin", model_file_type); } template void LLaMADecoderLayerWeight::setWeightPtr() { - //pre_layernorm_weights.beta = weights_ptr[0]; - pre_layernorm_weights.beta = nullptr; + pre_layernorm_weights.beta = weights_ptr[0]; pre_layernorm_weights.gamma = weights_ptr[1]; self_attention_weights.query_weight.kernel = weights_ptr[2]; - //self_attention_weights.query_weight.bias = weights_ptr[3]; - self_attention_weights.query_weight.bias = nullptr; + self_attention_weights.query_weight.bias = weights_ptr[3]; self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; - //self_attention_weights.attention_output_weight.bias = weights_ptr[5]; - self_attention_weights.attention_output_weight.bias = nullptr; + self_attention_weights.attention_output_weight.bias = weights_ptr[5]; ffn_weights.intermediate_weight.kernel = weights_ptr[6]; - //ffn_weights.intermediate_weight.bias = 
weights_ptr[7]; - ffn_weights.intermediate_weight.bias = nullptr; + ffn_weights.intermediate_weight.bias = weights_ptr[7]; ffn_weights.output_weight.kernel = weights_ptr[8]; - //ffn_weights.output_weight.bias = weights_ptr[9]; - ffn_weights.output_weight.bias = nullptr; + ffn_weights.output_weight.bias = weights_ptr[9]; ffn_weights.gating_weight.kernel = weights_ptr[10]; - //ffn_weights.gating_weight.bias = weights_ptr[11]; - ffn_weights.gating_weight.bias = nullptr; + ffn_weights.gating_weight.bias = weights_ptr[11]; - //post_attention_layernorm_weights.beta = weights_ptr[12]; - post_attention_layernorm_weights.beta = nullptr; + post_attention_layernorm_weights.beta = weights_ptr[12]; post_attention_layernorm_weights.gamma = weights_ptr[13]; is_maintain_buffer = true; } @@ -184,20 +177,20 @@ void LLaMADecoderLayerWeight::setWeightPtr() template void LLaMADecoderLayerWeight::mallocWeights() { - //deviceMalloc(&weights_ptr[0], hidden_units_); + deviceMalloc(&weights_ptr[0], hidden_units_); deviceMalloc(&weights_ptr[1], hidden_units_); deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_); - //deviceMalloc(&weights_ptr[3], 3 * hidden_units_); + deviceMalloc(&weights_ptr[3], 3 * hidden_units_); deviceMalloc(&weights_ptr[4], hidden_units_ * hidden_units_); - //deviceMalloc(&weights_ptr[5], hidden_units_); + deviceMalloc(&weights_ptr[5], hidden_units_); deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_); - //deviceMalloc(&weights_ptr[7], inter_size_); + deviceMalloc(&weights_ptr[7], inter_size_); deviceMalloc(&weights_ptr[8], inter_size_ * hidden_units_); - //deviceMalloc(&weights_ptr[9], hidden_units_); + deviceMalloc(&weights_ptr[9], hidden_units_); deviceMalloc(&weights_ptr[10], hidden_units_ * inter_size_); - //deviceMalloc(&weights_ptr[11], inter_size_); - //deviceMalloc(&weights_ptr[12], hidden_units_); + deviceMalloc(&weights_ptr[11], inter_size_); + deviceMalloc(&weights_ptr[12], hidden_units_); deviceMalloc(&weights_ptr[13], hidden_units_); } diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc index 81a22a51d..f7081de11 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.cc +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -78,7 +78,7 @@ LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): { mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); - //cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); @@ -105,7 +105,7 @@ LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); - //cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); @@ -123,7 +123,7 @@ template void LLaMAWeight::setWeightPtr() { pre_decoder_embedding_table = weights_ptr[0]; - //post_decoder_layernorm.beta = weights_ptr[1]; + post_decoder_layernorm.beta = weights_ptr[1]; post_decoder_layernorm.beta = nullptr; post_decoder_layernorm.gamma = weights_ptr[2]; post_decoder_embedding.kernel = weights_ptr[3]; @@ -135,7 +135,7 @@ void 
LLaMAWeight::mallocWeights() weights_ptr.resize(num_base_weights); deviceMalloc(&weights_ptr[0], vocab_size_ * hidden_units_); - //deviceMalloc(&weights_ptr[1], hidden_units_); + deviceMalloc(&weights_ptr[1], hidden_units_); deviceMalloc(&weights_ptr[2], hidden_units_); deviceMalloc(&weights_ptr[3], hidden_units_ * vocab_size_); @@ -152,7 +152,7 @@ void LLaMAWeight::loadModel(std::string dir_path) {(size_t)(vocab_size_ * hidden_units_)}, dir_path + "/model.tok_embeddings.weight.bin", model_file_type); - //loadWeightFromBin(weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.norm.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.norm.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.norm.weight.bin", model_file_type); loadWeightFromBin(weights_ptr[3], {(size_t)(vocab_size_ * hidden_units_)}, From 1494d2fe74c3b6a9226b30a9139a0c38ea66df96 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 15 Sep 2023 23:40:07 +0000 Subject: [PATCH 11/55] dump --- examples/cpp/llama/llama_config.ini | 6 +- examples/cpp/llama/llama_example.cc | 74 +--- examples/cpp/llama/llama_example_utils.cc | 6 +- examples/cpp/llama/start_ids.csv | 12 +- .../LLaMAContextAttentionLayer.cc | 342 ++++++--------- .../LLaMAContextAttentionLayer.h | 87 ++-- src/fastertransformer/models/llama/LLaMA.cc | 390 ++++++------------ src/fastertransformer/models/llama/LLaMA.h | 22 +- .../models/llama/LLaMAContextDecoder.cc | 276 ++++++------- .../models/llama/LLaMAWeight.cc | 1 - 10 files changed, 437 insertions(+), 779 deletions(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 68f4663d1..9cb766533 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -7,8 +7,7 @@ pipeline_para_size=4 [request] beam_width=1 # beam width for beam search -request_batch_size=8 # determine by the request -request_output_len=0 # determine by the request +request_batch_size=4 # determine by the request [llama_33B] head_num=52 @@ -17,5 +16,4 @@ vocab_size=32000 decoder_layers=60 rotary_embedding=128 multiple_of=256 -start_id=0 -end_id=2 +padding_id=0 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index c1f4521bf..2955cbb14 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -81,16 +81,13 @@ void llama_example(const INIReader reader) const size_t decoder_layers = reader.GetInteger(model_name, "decoder_layers"); const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); const int multiple_of = reader.GetInteger(model_name, "multiple_of"); - const int start_id = reader.GetInteger(model_name, "start_id"); - const int end_id = reader.GetInteger(model_name, "end_id"); const size_t hidden_units = head_num * size_per_head; const size_t inter_size = multiple_of * ((2 * hidden_units + multiple_of - 1) / multiple_of); - const size_t beam_width = reader.GetInteger("request", "beam_width"); const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); - const int request_output_len = reader.GetInteger("request", "request_output_len"); const int min_length = reader.GetInteger("request", "min_length", 0); + const int padding_id = reader.GetInteger(model_name, "padding_id"); FT_CHECK(decoder_layers % pipeline_para_size == 0); @@ -128,29 +125,6 @@ void llama_example(const INIReader reader) NcclParam pipeline_para; 
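// NOTE (illustrative): worked example of the inter_size rounding used above. With the [llama_33B]
// values from llama_config.ini in this patch (head_num = 52, size_per_head = 128, multiple_of = 256),
// hidden_units = 6656 and 2 * hidden_units is already a multiple of 256, so no padding is added.
#if 0  // illustrative check
constexpr size_t head_num_33b      = 52;
constexpr size_t size_per_head_33b = 128;
constexpr size_t multiple_of_33b   = 256;
constexpr size_t hidden_units_33b  = head_num_33b * size_per_head_33b;  // 6656
constexpr size_t inter_size_33b =
    multiple_of_33b * ((2 * hidden_units_33b + multiple_of_33b - 1) / multiple_of_33b);
static_assert(inter_size_33b == 13312, "13312 == 2 * 6656, rounded up to a multiple of 256");
#endif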
ftNcclInitialize(tensor_para, pipeline_para, 1, pipeline_para_size); - // Handle bad_words dictionary - std::vector bad_words; - read_word_list("../examples/cpp/llama/bad_words.csv", bad_words); - - int* d_bad_words = nullptr; - deviceMalloc(&d_bad_words, bad_words.size(), false); - cudaH2Dcpy(d_bad_words, bad_words.data(), bad_words.size()); - - // Handle stop_words dictionary - std::vector stop_words; - read_word_list("../examples/cpp/llama/stop_words.csv", stop_words); - - const size_t stop_words_len = stop_words.size() / 2; - // Tile with same dict for each element - std::vector tiled_stop_words; - for (int i = 0; i < request_batch_size; i++) { - tiled_stop_words.insert(tiled_stop_words.end(), stop_words.begin(), stop_words.end()); - } - - int* d_stop_words = nullptr; - deviceMalloc(&d_stop_words, tiled_stop_words.size(), false); - cudaH2Dcpy(d_stop_words, tiled_stop_words.data(), tiled_stop_words.size()); - // Read ids of request from file. size_t max_input_len = -1; std::vector v_start_lengths; @@ -159,7 +133,7 @@ void llama_example(const INIReader reader) &v_start_lengths, &v_start_ids, max_input_len, - end_id, + padding_id, 1, "../examples/cpp/llama/start_ids.csv"); @@ -177,10 +151,8 @@ void llama_example(const INIReader reader) cudaH2Dcpy(d_input_ids, v_start_ids.data(), request_batch_size * max_input_len); cudaH2Dcpy(d_input_lengths, v_start_lengths.data(), request_batch_size); } - std::vector start_ids(request_batch_size, start_id); - std::vector end_ids(request_batch_size, end_id); - const int total_output_len = max_input_len + request_output_len; + const int total_output_len = max_input_len; cudaStream_t stream; cublasHandle_t cublas_handle; @@ -203,12 +175,8 @@ void llama_example(const INIReader reader) cublas_wrapper.setFP32GemmConfig(); } - fastertransformer::LLaMAWeight llama_weights(hidden_units, - inter_size, - vocab_size, - decoder_layers, - pipeline_para.world_size_, - pipeline_para.rank_); + fastertransformer::LLaMAWeight llama_weights( + hidden_units, inter_size, vocab_size, decoder_layers, pipeline_para.world_size_, pipeline_para.rank_); model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; llama_weights.loadModel(model_dir); @@ -234,22 +202,20 @@ void llama_example(const INIReader reader) decoder_layers, vocab_size, rotary_embedding_dim, - start_id, - end_id, random_seed, tensor_para, pipeline_para, stream, &cublas_wrapper, &allocator, - false, + false, // is_free_buffer_after_forward &prop, attention_type); int* d_output_ids; int* d_sequence_lengths; - deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); - deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); + deviceMalloc(&d_output_ids, request_batch_size * total_output_len, false); + deviceMalloc(&d_sequence_lengths, request_batch_size, false); std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", @@ -257,27 +223,18 @@ void llama_example(const INIReader reader) {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size}, d_input_lengths}}, {"output_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}, - {"bad_words_list", Tensor{MEMORY_GPU, TYPE_INT32, {2, bad_words.size() / 2}, d_bad_words}}, - {"stop_words_list", Tensor{MEMORY_GPU, TYPE_INT32, {request_batch_size, 2, stop_words_len}, d_stop_words}}, {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{1}, &min_length}}, - 
{"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, start_ids.data()}}, - {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, end_ids.data()}}}; - - input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}); + {"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}}; std::unordered_map output_tensors = std::unordered_map{ {"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, - std::vector{request_batch_size, beam_width, (size_t)total_output_len}, + std::vector{request_batch_size, 1, (size_t)total_output_len}, d_output_ids}}, {"sequence_length", - Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, beam_width}, d_sequence_lengths}}, - {"output_log_probs", - Tensor{MEMORY_GPU, - TYPE_FP32, - std::vector{(size_t)request_output_len, request_batch_size, beam_width}, - nullptr}}}; + Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size}, d_sequence_lengths}}, + }; print_mem_usage(); @@ -307,7 +264,7 @@ void llama_example(const INIReader reader) printf("[WARNING] Cannot write results into output file %s \n", fName.c_str()); } else { - size_t outCount = total_output_len * request_batch_size * beam_width; + size_t outCount = total_output_len * request_batch_size; int* hBuf = new int[outCount]; cudaD2Hcpy(hBuf, d_output_ids, outCount); @@ -357,10 +314,9 @@ void llama_example(const INIReader reader) cudaProfilerStop(); - printf("[INFO] request_batch_size %ld beam_width %ld head_num %ld size_per_head %ld total_output_len %d" + printf("[INFO] request_batch_size %ld head_num %ld size_per_head %ld total_output_len %d" " decoder_layers %ld vocab_size %ld FT-CPP-decoding-beamsearch-time %.2f ms\n", request_batch_size, - beam_width, head_num, size_per_head, total_output_len, @@ -374,8 +330,6 @@ void llama_example(const INIReader reader) delete cublas_algo_map; delete cublas_wrapper_mutex; - cudaFree(d_bad_words); - cudaFree(d_stop_words); if (d_input_ids != nullptr) { cudaFree(d_input_ids); } diff --git a/examples/cpp/llama/llama_example_utils.cc b/examples/cpp/llama/llama_example_utils.cc index 77f621dbf..d6db80856 100644 --- a/examples/cpp/llama/llama_example_utils.cc +++ b/examples/cpp/llama/llama_example_utils.cc @@ -26,7 +26,7 @@ int read_start_ids(size_t batch_size, std::vector* v_start_lengths, std::vector* v_start_ids, size_t& max_input_len, - const int end_id, + const int padding_id, const int beam_width, std::string file_name) { @@ -68,7 +68,7 @@ int read_start_ids(size_t batch_size, while ((int)tmp_start_lengths.size() < batch_size) { std::vector padding_ids; for (int i = 0; i < max_input_len; i++) { - padding_ids.push_back(end_id); + padding_ids.push_back(padding_id); } tmp_start_ids.push_back(padding_ids); tmp_start_lengths.push_back(max_input_len); @@ -77,7 +77,7 @@ int read_start_ids(size_t batch_size, // Add padding for (int i = 0; i < (int)tmp_start_ids.size(); i++) { for (int j = (int)tmp_start_ids[i].size(); j < max_input_len; j++) { - tmp_start_ids[i].push_back(end_id); + tmp_start_ids[i].push_back(padding_id); } } diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 88e742f39..a74083153 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1,8 +1,4 @@ -688, 253, 1390, 4564, 273, 1897, 13, 247 -510, 1457, 8911, 4487, 273, 26593, 310, 6600 -510, 1457, 2816, 28260, 452, 247, 747, 1481 -510, 1457, 2816, 7717, 556, 3863, 697, 7970 -688, 247, 2118, 326, 588, 2779, 1056, 352 -510, 1457, 2816, 28260, 8, 
13413, 19169, 14745 -510, 9462, 5687, 556, 38350, 26212, 253, 747 -510, 806, 673, 309, 3047, 253, 6440, 13 \ No newline at end of file +1, 14542, 3262, 8112, 29901, 7803, 1757, 526, 13407, 297, 263, 13569, 29889, 2688, 526, 13587, 701, 27815, 29889, 0 +1, 5057, 12500, 29901, 319, 8023, 338, 2734, 1623, 263, 5702, 29889, 450, 8023, 6057, 964, 263, 1559, 29889, 0 +1, 5057, 12500, 29901, 319, 8023, 338, 2734, 1623, 263, 5702, 29889, 450, 8023, 4947, 297, 263, 1775, 29889, 0 +1, 28551, 292, 29901, 11647, 526, 13407, 373, 263, 17306, 310, 15007, 29889, 11647, 526, 1985, 2768, 263, 5214, 29889 diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 38ec79b47..8837acb82 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -16,6 +16,7 @@ */ #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" #include "src/fastertransformer/kernels/unfused_attention_kernels.h" #include "src/fastertransformer/utils/nvtx_utils.h" @@ -28,16 +29,15 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten { // input_tensors: // input_query [token_num, hidden_dimension] - // attention_mask [batch_size, 1, seq_len, seq_len + max_prompt_length] + // attention_mask [batch_size, 1, seq_len, seq_len] // attention_type [1] // is_final_layer [1], bool on cpu // layer_id [1], int on cpu // padding_offset, int, [token_num] (optional) // cu_seqlens, int, [batch_size] (optional) - // d_prefix_prompt_batch [global_batch_size], (optional) - // each element contains ptr with buffer shape[2, local_head_num_, prompt_length, size_per_head] - // d_prefix_prompt_lengths [batch_size], int (optional) - // linear_bias_slopes [head_num] (optional) + // each element contains ptr with buffer shape[2, head_num_, prompt_length, size_per_head] + // pre_layernorm_weights_gamma [hidden_dimension] + // pre_layernorm_weights_beta [hidden_dimension] // output_tensors: // hidden_features [token_num, hidden_dimension] @@ -45,20 +45,14 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // value_cache [batch, local_head_num, max_seq_len, size_per_head] FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); FT_CHECK(output_tensors->at("key_cache").shape.size() == 5); - FT_CHECK(output_tensors->at("value_cache").shape.size() == 4 - || output_tensors->at("value_cache").shape.size() == 3); - const int request_batch_size = input_tensors->at("attention_mask").shape[0]; - const int request_seq_len = input_tensors->at("attention_mask").shape[2]; - const int max_prompt_length = - input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; - const int layer_id = input_tensors->getVal("layer_id"); - const T** d_prefix_prompt_batch = input_tensors->getPtr("d_prefix_prompt_batch", nullptr); - const int* d_prefix_prompt_lengths = input_tensors->getPtr("d_prefix_prompt_lengths", nullptr); - const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); - int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); - T* linear_bias_slopes = input_tensors->getPtr("linear_bias_slopes", nullptr); - /* float* attention_query_dynamic_scale = input_tensors->getPtr("attention_query_dynamic_scale", - * nullptr); */ + FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); + const 
int request_batch_size = input_tensors->at("attention_mask").shape[0]; + const int request_seq_len = input_tensors->at("attention_mask").shape[2]; + const int layer_id = input_tensors->getVal("layer_id"); + const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); + int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + const T* pre_layernorm_weights_gamma = input_tensors->getPtr("pre_layernorm_weights_gamma"); + const T* pre_layernorm_weights_beta = input_tensors->getPtr("pre_layernorm_weights_beta"); T* attention_out = output_tensors->at("hidden_features").getPtr(); T* attention_input = input_tensors->at("input_query").getPtr(); @@ -69,89 +63,84 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten "LLaMA Context FUSED_PADDED_MHA is not supported !"); PUSH_RANGE("attention buffer alloc"); - allocateBuffer(request_batch_size, request_seq_len + max_prompt_length, attention_type != AttentionType::FUSED_MHA); + allocateBuffer(request_batch_size, request_seq_len, attention_type != AttentionType::FUSED_MHA); POP_RANGE; sync_check_cuda_error(); const bool is_final = input_tensors->at("is_final_layer").getVal(); + const int m = input_tensors->at("input_query").shape[0]; - const int m = input_tensors->at("input_query").shape[0]; + PUSH_RANGE("attention buffer alloc"); + invokeGeneralLLaMALayerNorm(decoder_normed_input_, + attention_input, + pre_layernorm_weights_gamma, + pre_layernorm_weights_beta, + layernorm_eps_, + m, + hidden_units_, + stream_); + sync_check_cuda_error(); + POP_RANGE; + // if (l == 0) { + // T* out = (T*)malloc(sizeof(T) * h_token_num * hidden_units_); + // cudaMemcpy(out, decoder_normed_input_, sizeof(T) * h_token_num * hidden_units_, cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < h_token_num; ++b) { + // std::cout << "["; + // int i = 0; + // for (int h = 0; h < hidden_units_; ++h) { + // std::cout << out[b * hidden_units_ + h] << " "; + // ++i; + // if (i == 8) + // break; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } + sync_check_cuda_error(); PUSH_RANGE("qkv_gemm"); -#ifdef SPARSITY_ENABLED - const int m_padded = 8 * div_up(m, 8); - bool use_sparse = sparse_ && cublas_wrapper_->isUseSparse(1, 3 * local_hidden_units_, m_padded, hidden_units_); -#else - constexpr bool use_sparse = false; -#endif - - if (use_sparse) { -#ifdef SPARSITY_ENABLED - cublas_wrapper_->SpGemm(CUBLAS_OP_N, - CUBLAS_OP_N, - 3 * local_hidden_units_, - m_padded, - hidden_units_, - attention_weights->query_weight.sp_kernel, - attention_input, - qkv_buf_); -#endif - } - else if (int8_mode_ == 1) { - FT_CHECK(weight_only_int8_fc_runner_.get() != NULL && attention_weights->query_weight.int8_kernel != NULL - && attention_weights->query_weight.weight_only_quant_scale != NULL); - - weight_only_int8_fc_runner_->gemm(attention_input, - reinterpret_cast(attention_weights->query_weight.int8_kernel), - attention_weights->query_weight.weight_only_quant_scale, - qkv_buf_, - m, - 3 * local_hidden_units_, - hidden_units_, - mixed_gemm_workspace_, - mixed_gemm_ws_bytes_, - stream_); - } - else if (int8_mode_ == 2) { - cublas_wrapper_->Int8Gemm(3 * local_hidden_units_, - m, - hidden_units_, - attention_weights->query_weight.int8_kernel, - hidden_units_, - input_tensors->at("input_query").getPtr(), - hidden_units_, - reinterpret_cast(qkv_buf_), - 3 * local_hidden_units_, - attention_weights->query_weight.scale_inter, - true); - } - else { - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - 3 * local_hidden_units_, 
// n - m, - hidden_units_, // k - attention_weights->query_weight.kernel, - 3 * local_hidden_units_, // n - attention_input, - hidden_units_, // k - qkv_buf_, - 3 * local_hidden_units_ /* n */); - } - + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + 3 * hidden_units_, // n + m, + hidden_units_, // k + attention_weights->query_weight.kernel, + 3 * hidden_units_, // n + decoder_normed_input_, + hidden_units_, // k + qkv_buf_, + 3 * hidden_units_ /* n */); sync_check_cuda_error(); + // if (layer_id == 0) { + // T* qkv_buf = (T*)malloc(sizeof(T) * m * 3 * hidden_units_); + // cudaMemcpy(qkv_buf, qkv_buf_, sizeof(T) * m * 3 * hidden_units_, cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < request_batch_size; ++b) { + // std::cout << "["; + // for (int s = 0; s < request_seq_len; ++s) { + // std::cout << "["; + // for (int h = 0; h < 8; ++h) { + // std::cout << qkv_buf[((b * request_seq_len) + s) * 3 * hidden_units_ + h + 2 * hidden_units_] + // << " "; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } // IDEA: append prefix prompt key value here - PrefixPromptBatchWeightsParam param{d_prefix_prompt_batch, - d_prefix_prompt_lengths, - max_prompt_length, - (size_t)layer_id * 2 * local_head_num_ * size_per_head_}; + PrefixPromptBatchWeightsParam param{nullptr, nullptr, 0, (size_t)layer_id * 2 * head_num_ * size_per_head_}; if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync( - q_buf_2_, 0, request_batch_size * request_seq_len * 3 * local_hidden_units_ * sizeof(T), stream_); + cudaMemsetAsync(q_buf_2_, 0, request_batch_size * request_seq_len * 3 * hidden_units_ * sizeof(T), stream_); } invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, @@ -163,12 +152,12 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten request_batch_size, request_seq_len, m, - local_head_num_, + head_num_, size_per_head_, rotary_embedding_dim_, neox_rotary_style_, attention_weights->query_weight.scale_out, - int8_mode_, + 0, // int8_mode stream_); sync_check_cuda_error(); @@ -181,13 +170,14 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten k_buf_2_, v_buf_2_, request_batch_size, - max_prompt_length + request_seq_len, // max input length + prefix prompt length + request_seq_len, max_seq_len, size_per_head_, - local_head_num_, + head_num_, stream_); - // IDEA : after this, k_cache = (batch_size, num_heads, Dh/x, prefix_prompt_len + L, x) - // k_cache = (batch_size, num_heads, prefix_prompt_len + L, Dh) + // IDEA : after this, + // k_cache = (batch_size, num_heads, Dh/x, L, x) + // v_cache = (batch_size, num_heads, L, Dh) sync_check_cuda_error(); // TODO: fmha kernels doesn't support different seq lengths of q and kv @@ -200,8 +190,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten POP_RANGE; if (is_final == false) { const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = request_seq_len; // q length - const int attention_seq_len_2 = max_prompt_length + request_seq_len; // kv length + const int attention_seq_len_1 = request_seq_len; // q length + const int attention_seq_len_2 = request_seq_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); if (attention_type != AttentionType::FUSED_MHA) { if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { @@ -225,7 +215,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten CUDA_R_32F, attention_seq_len_2, // n 
attention_seq_len_2 * attention_seq_len_1, - request_batch_size * local_head_num_, // global batch size + request_batch_size * head_num_, // global batch size CUDA_R_32F); sync_check_cuda_error(); @@ -239,9 +229,9 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten param.batch_size = request_batch_size; param.q_length = attention_seq_len_1; param.k_length = attention_seq_len_2; - param.num_heads = local_head_num_; + param.num_heads = head_num_; param.qk_scale = qk_scale; - param.linear_bias_slopes = const_cast(linear_bias_slopes); // (head_num,), optional + param.linear_bias_slopes = nullptr; invokeMaskedSoftmax(param, stream_); sync_check_cuda_error(); POP_RANGE; @@ -262,7 +252,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qk_buf_, attention_seq_len_2, attention_seq_len_2 * attention_seq_len_1, - request_batch_size * local_head_num_); + request_batch_size * head_num_); POP_RANGE; PUSH_RANGE("softmax"); @@ -273,9 +263,9 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten param.batch_size = request_batch_size; param.q_length = attention_seq_len_1; param.k_length = attention_seq_len_2; - param.num_heads = local_head_num_; + param.num_heads = head_num_; param.qk_scale = qk_scale; - param.linear_bias_slopes = const_cast(linear_bias_slopes); // (head_num,), optional + param.linear_bias_slopes = nullptr; invokeMaskedSoftmax(param, stream_); sync_check_cuda_error(); POP_RANGE; @@ -297,7 +287,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_2_, size_per_head_, attention_seq_len_1 * size_per_head_, - request_batch_size * local_head_num_); + request_batch_size * head_num_); // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) if (padding_offset == nullptr) { @@ -305,10 +295,10 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_2_, request_batch_size, attention_seq_len_1, - local_head_num_, + head_num_, size_per_head_, attention_weights->attention_output_weight.scale, - int8_mode_, + 0, // int8_mode stream_); sync_check_cuda_error(); } @@ -318,11 +308,11 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten m, request_batch_size, attention_seq_len_1, - local_head_num_, + head_num_, size_per_head_, padding_offset, attention_weights->attention_output_weight.scale, - int8_mode_, + 0, // int8_mode stream_); } POP_RANGE; @@ -330,68 +320,18 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten sync_check_cuda_error(); PUSH_RANGE("proj gemm"); -#ifdef SPARSITY_ENABLED - bool use_sparse = sparse_ && cublas_wrapper_->isUseSparse(1, hidden_units_, m_padded, local_hidden_units_); -#endif - if (use_sparse) { -#ifdef SPARSITY_ENABLED - cublas_wrapper_->SpGemm(CUBLAS_OP_N, - CUBLAS_OP_N, - hidden_units_, - m_padded, - local_hidden_units_, - attention_weights->attention_output_weight.sp_kernel, - qkv_buf_3_, - attention_out); -#endif - } - else { - if (int8_mode_ == 1) { - FT_CHECK(weight_only_int8_fc_runner_.get() != NULL - && attention_weights->attention_output_weight.int8_kernel != NULL - && attention_weights->attention_output_weight.weight_only_quant_scale != NULL); - - weight_only_int8_fc_runner_->gemm( - qkv_buf_3_, - reinterpret_cast(attention_weights->attention_output_weight.int8_kernel), - attention_weights->attention_output_weight.weight_only_quant_scale, - attention_out, - m, - hidden_units_, - local_hidden_units_, - mixed_gemm_workspace_, - mixed_gemm_ws_bytes_, - stream_); - } - else if (int8_mode_ == 2) { - 
int8_fc_runner_->gemm(reinterpret_cast(qkv_buf_3_), - attention_weights->attention_output_weight.int8_kernel, - QuantMode::PerTensorQuant, - attention_weights->attention_output_weight.scale_inter, - attention_weights->attention_output_weight.scale_out, - output_tensors->at("hidden_features").getPtr(), - m, - hidden_units_, - local_hidden_units_, - nullptr, - 0, - stream_); - } - else { - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - hidden_units_, - m, - local_hidden_units_, - attention_weights->attention_output_weight.kernel, - hidden_units_, - qkv_buf_3_, - local_hidden_units_, - attention_out, - hidden_units_); - } - } + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + hidden_units_, + m, + hidden_units_, + attention_weights->attention_output_weight.kernel, + hidden_units_, + qkv_buf_3_, + hidden_units_, + attention_out, + hidden_units_); POP_RANGE; } @@ -420,14 +360,11 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), - local_head_num_(head_num), - local_hidden_units_(local_head_num_ * size_per_head), rotary_embedding_dim_(0), neox_rotary_style_(false), is_qk_buf_float_(is_qk_buf_float || int8_mode == 2), weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), - int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr), - int8_mode_(int8_mode) + int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr) { } @@ -450,17 +387,14 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), - local_head_num_(local_head_num), - local_hidden_units_(local_head_num_ * size_per_head), rotary_embedding_dim_(0), neox_rotary_style_(false), is_qk_buf_float_(is_qk_buf_float || int8_mode == 2), weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), - int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr), - int8_mode_(int8_mode) + int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - dispatcher_fp16.reset(new FusedMHARunnerFP16v2(local_head_num_, size_per_head_, sm_, 1.0f)); + dispatcher_fp16.reset(new FusedMHARunnerFP16v2(head_num_, size_per_head_, sm_, 1.0f)); } template @@ -484,17 +418,14 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), - local_head_num_(local_head_num), - local_hidden_units_(local_head_num_ * size_per_head), rotary_embedding_dim_(rotary_embedding_dim), neox_rotary_style_(neox_rotary_style), is_qk_buf_float_(is_qk_buf_float), weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), - int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr), - int8_mode_(int8_mode) + int8_fc_runner_(int8_mode == 2 ? 
std::make_shared>() : nullptr) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - dispatcher_fp16.reset(new FusedMHARunnerFP16v2(local_head_num_, size_per_head_, sm_, 1.0f)); + dispatcher_fp16.reset(new FusedMHARunnerFP16v2(head_num_, size_per_head_, sm_, 1.0f)); } template @@ -509,14 +440,11 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(LLaMAContextAttentionL head_num_(attention_layer.head_num_), size_per_head_(attention_layer.size_per_head_), hidden_units_(attention_layer.hidden_units_), - local_head_num_(attention_layer.local_head_num_), - local_hidden_units_(attention_layer.local_hidden_units_), rotary_embedding_dim_(attention_layer.rotary_embedding_dim_), neox_rotary_style_(attention_layer.neox_rotary_style_), is_qk_buf_float_(attention_layer.is_qk_buf_float_), weight_only_int8_fc_runner_(attention_layer.weight_only_int8_fc_runner_), - int8_fc_runner_(attention_layer.int8_fc_runner_), - int8_mode_(attention_layer.int8_mode_) + int8_fc_runner_(attention_layer.int8_fc_runner_) { } @@ -537,54 +465,33 @@ template void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - // const auto type_size = int8_mode_ == 2 ? sizeof(int8_t) : sizeof(T); - // NOTE (perkzz): use sizeof(T) here for cutlass int8 kernels. - const auto type_size = sizeof(T); - qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, type_size * 3 * batch_size * seq_len * local_hidden_units_, true); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * local_hidden_units_, true); - k_buf_2_ = q_buf_2_ + batch_size * seq_len * local_hidden_units_; - v_buf_2_ = k_buf_2_ + batch_size * seq_len * local_hidden_units_; + qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, true); + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, true); + k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; + v_buf_2_ = k_buf_2_ + batch_size * seq_len * hidden_units_; + decoder_normed_input_ = reinterpret_cast( + allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); // save memory usage when using fmha if (allocate_qk_buf) { - qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * local_head_num_ * seq_len * seq_len, true); + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, true); } else { allocator_->free((void**)(&qk_buf_)); } - qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * local_hidden_units_, true); - qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, type_size * batch_size * seq_len * local_hidden_units_, true); + qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, true); + qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, true); if (is_qk_buf_float_ == true) { if (allocate_qk_buf) { qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * local_head_num_ * seq_len * seq_len, true); + qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, true); } else { allocator_->free((void**)(&qk_buf_float_)); } } - if (int8_mode_ == 1) { - // We use max_size for n and k since we reuse buffers for both FCs and want to allocate the max - // possible memory that would be required by any of the individual gemms. 
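A note on the buffer layout above: `q_buf_2_` is one contiguous allocation of `3 * batch_size * seq_len * hidden_units_` elements, and `k_buf_2_` / `v_buf_2_` are fixed offsets into it, so Q, K and V sit back to back in memory. The following standalone sketch shows the same carving, with plain `malloc` standing in for the FasterTransformer allocator and the sizes chosen arbitrarily for illustration.

```
#include <cstdlib>

int main() {
    const std::size_t batch_size = 4, seq_len = 32, hidden_units = 6656;
    const std::size_t chunk = batch_size * seq_len * hidden_units;

    // One contiguous block holding Q, K and V back to back, mirroring how
    // q_buf_2_, k_buf_2_ and v_buf_2_ are carved out in allocateBuffer().
    float* q = static_cast<float*>(std::malloc(sizeof(float) * 3 * chunk));
    if (q == nullptr) {
        return 1;
    }
    float* k = q + chunk;  // K view starts right after the Q region
    float* v = k + chunk;  // V view starts right after the K region

    q[0] = 1.f; k[0] = 2.f; v[0] = 3.f;  // each pointer indexes its own third of the block

    std::free(q);  // a single free releases all three views
    return 0;
}
```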
- const int max_size = std::max(hidden_units_, 3 * local_hidden_units_); - mixed_gemm_ws_bytes_ = weight_only_int8_fc_runner_->getWorkspaceSize(batch_size * seq_len, max_size, max_size); - mixed_gemm_workspace_ = (char*)allocator_->reMalloc(mixed_gemm_workspace_, mixed_gemm_ws_bytes_, false); - } - - if (int8_mode_ == 1) { - // We use max_size for n and k since we reuse buffers for both FCs and want to allocate the max - // possible memory that would be required by any of the individual gemms. - const int max_size = std::max(hidden_units_, 3 * local_hidden_units_); - mixed_gemm_ws_bytes_ = weight_only_int8_fc_runner_->getWorkspaceSize(batch_size * seq_len, max_size, max_size); - mixed_gemm_workspace_ = (char*)allocator_->reMalloc(mixed_gemm_workspace_, mixed_gemm_ws_bytes_, false); - } - else if (int8_mode_ == 2) { - const int max_size = std::max(hidden_units_, 3 * local_hidden_units_); - int8_gemm_ws_bytes_ = int8_fc_runner_->getWorkspaceSize(batch_size * seq_len, max_size, max_size); - int8_gemm_workspace_ = (char*)allocator_->reMalloc(int8_gemm_workspace_, int8_gemm_ws_bytes_, false); - } is_allocate_buffer_ = true; } @@ -598,6 +505,7 @@ void LLaMAContextAttentionLayer::freeBuffer() allocator_->free((void**)(&qk_buf_)); allocator_->free((void**)(&qkv_buf_2_)); allocator_->free((void**)(&qkv_buf_3_)); + allocator_->free((void**)(&decoder_normed_input_)); if (is_qk_buf_float_ == true) { allocator_->free((void**)(&qk_buf_float_)); diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index 6a18d734e..e52fdc0a7 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -32,13 +32,12 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { size_t max_seq_len_ = 0; // metadata - const size_t head_num_; - const size_t size_per_head_; - const size_t hidden_units_; - const size_t local_head_num_; - const size_t local_hidden_units_; - const size_t rotary_embedding_dim_; - const bool neox_rotary_style_; + const size_t head_num_; + const size_t size_per_head_; + const size_t hidden_units_; + const size_t rotary_embedding_dim_; + const bool neox_rotary_style_; + static constexpr float layernorm_eps_ = 1e-6f; // fmha runner int sm_ = getSMVersion(); @@ -73,52 +72,48 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { size_t mixed_gemm_ws_bytes_ = 0; char* int8_gemm_workspace_ = nullptr; size_t int8_gemm_ws_bytes_ = 0; - - // int8_mode_ == 0 means we don't use any mechanism related to INT8. 
- // int8_mode_ == 1 for weight quantized only gemm for GPT - // int8_mode_ == 2 for SmoothQuant O3 (per tensor scales) - const int int8_mode_ = 0; + T* decoder_normed_input_ = nullptr; public: LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, - size_t head_num, - size_t size_per_head, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse = false, - int int8_mode = 0); + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse = false, + int int8_mode = 0); LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, - size_t head_num, - size_t size_per_head, - size_t local_head_num, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse = false, - int int8_mode = 0); + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + size_t local_head_num, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse = false, + int int8_mode = 0); LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, - size_t head_num, - size_t size_per_head, - size_t local_head_num, - size_t rotary_embedding_dim, - bool neox_rotary_style_, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse = false, - int int8_mode = 0); + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + size_t local_head_num, + size_t rotary_embedding_dim, + bool neox_rotary_style_, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse = false, + int int8_mode = 0); LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer); diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index fb8eb4f9c..3b52fe2e1 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -51,61 +51,37 @@ void LLaMA::allocateBuffer() } template -void LLaMA::allocateBuffer( - size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) +void LLaMA::allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - const size_t batchxbeam = batch_size * beam_width; const size_t self_cache_size = - (num_layer_ / pipeline_para_.world_size_) * batchxbeam * max_cache_seq_len * hidden_units_; + (num_layer_ / pipeline_para_.world_size_) * batch_size * max_cache_seq_len * hidden_units_; input_attention_mask_ = (T*)(allocator_->reMalloc( - input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); - decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_cache_seq_len, false)); decoder_output_buf_ = - (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - normed_decoder_output_buf_ = - 
(T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_, false)); - nccl_logits_buf_ = - (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_, false)); - cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); - finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); - h_finished_buf_ = new bool[batchxbeam]; - sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); + (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); + logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * vocab_size_, false)); + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batch_size, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); value_cache_ = key_cache_ + self_cache_size; - if (beam_width > 1) { - cache_indirections_[0] = - (int*)(allocator_->reMalloc(cache_indirections_[0], sizeof(int) * batchxbeam * max_seq_len * 2, true)); - cache_indirections_[1] = cache_indirections_[0] + batchxbeam * max_seq_len; - } tiled_input_ids_buf_ = - (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); - tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batchxbeam, true)); - tiled_total_padding_count_ = - (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), false); + (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batch_size * max_input_len, true)); + tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batch_size, true)); transposed_output_ids_buf_ = - (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); - output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); - parent_ids_buf_ = (int*)(allocator_->reMalloc(parent_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); - seq_limit_len_ = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false)); - masked_tokens_ = (bool*)(allocator_->reMalloc(masked_tokens_, sizeof(bool) * batchxbeam * max_cache_seq_len, true)); + (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batch_size * max_seq_len, true)); + output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batch_size * max_seq_len, true)); start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, false)); end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false)); context_decoder_input_buf_ = (T*)(allocator_->reMalloc( - context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); + context_decoder_input_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); context_decoder_output_buf_ = (T*)(allocator_->reMalloc( - context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); - output_log_probs_buf_ = - (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false)); - - generation_should_stop_ = 
(bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + context_decoder_output_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); is_allocate_buffer_ = true; } @@ -115,14 +91,8 @@ void LLaMA::freeBuffer() { if (is_allocate_buffer_) { allocator_->free((void**)(&input_attention_mask_)); - allocator_->free((void**)(&decoder_input_buf_)); allocator_->free((void**)(&decoder_output_buf_)); - allocator_->free((void**)(&normed_decoder_output_buf_)); allocator_->free((void**)(&logits_buf_)); - allocator_->free((void**)(&nccl_logits_buf_)); - allocator_->free((void**)(&cum_log_probs_)); - allocator_->free((void**)(&finished_buf_)); - delete[] h_finished_buf_; allocator_->free((void**)(&sequence_lengths_)); allocator_->free((void**)(&key_cache_)); @@ -132,22 +102,14 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&tiled_input_ids_buf_)); allocator_->free((void**)(&tiled_input_lengths_buf_)); - allocator_->free((void**)(&tiled_total_padding_count_)); allocator_->free((void**)(&transposed_output_ids_buf_)); allocator_->free((void**)(&output_ids_buf_)); - allocator_->free((void**)(&parent_ids_buf_)); - allocator_->free((void**)(&seq_limit_len_)); - allocator_->free((void**)(&masked_tokens_)); - allocator_->free((void**)(&start_ids_buf_)); allocator_->free((void**)(&end_ids_buf_)); allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); - allocator_->free((void**)(&output_log_probs_buf_)); - - allocator_->free((void**)(&generation_should_stop_), true); is_allocate_buffer_ = false; } @@ -160,8 +122,6 @@ LLaMA::LLaMA(size_t head_num, size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, - int start_id, - int end_id, unsigned long long random_seed, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -178,8 +138,6 @@ LLaMA::LLaMA(size_t head_num, num_layer_(num_layer), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), - start_id_(start_id), - end_id_(end_id), hidden_units_(head_num * size_per_head), attention_type_(attention_type) { @@ -195,8 +153,6 @@ LLaMA::LLaMA(size_t head_num, size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, - int start_id, - int end_id, unsigned long long random_seed, NcclParam tensor_para, NcclParam pipeline_para, @@ -215,8 +171,6 @@ LLaMA::LLaMA(size_t head_num, num_layer_(num_layer), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), - start_id_(start_id), - end_id_(end_id), hidden_units_(head_num * size_per_head), pipeline_para_(pipeline_para), custom_all_reduce_comm_(custom_all_reduce_comm), @@ -235,8 +189,6 @@ LLaMA::LLaMA(LLaMA const& llama): num_layer_(llama.num_layer_), vocab_size_(llama.vocab_size_), rotary_embedding_dim_(llama.rotary_embedding_dim_), - start_id_(llama.start_id_), - end_id_(llama.end_id_), hidden_units_(llama.hidden_units_), pipeline_para_(llama.pipeline_para_), custom_all_reduce_comm_(llama.custom_all_reduce_comm_), @@ -284,20 +236,12 @@ void LLaMA::forward(std::unordered_map* output_ten // input_ids [batch_size, max_input_length] // input_lengths [batch_size] // output_seq_len [batch_size] on cpu - // start_id [batch_size] on cpu, optional - // end_id [batch_size] on cpu, optional - // stop_words_list [batch_size, 2, stop_words_length], optional - // bad_words_list [2, bad_words_length] or [batch_size, 2, bad_words_length], optional // min_length [1] or [batch_size] on cpu, optional, int // random_seed [1] or [batch_size] on cpu, optional, unsigned long long int. 
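The key cache wired into these output tensors (see `self_k_cache_shape`, set up shortly after this comment block) splits the head dimension as `Dh/x` by `x`, where `x = 16 / sizeof(T)`, so the innermost slice is one 16-byte vector. The sketch below shows how that packing factor works out for the `llama_33B` head geometry; the sequence length is arbitrary, and `uint16_t` stands in for FP16 since only `sizeof(T)` matters here.

```
#include <cstdint>
#include <cstdio>

// x = 16 / sizeof(T): elements per 16-byte chunk in the batch-major K cache,
// giving a per-layer shape of (batch, head_num, Dh/x, L, x).
template <typename T>
void print_k_cache_dims(std::size_t head_num, std::size_t size_per_head, std::size_t max_seq_len) {
    const std::size_t x = 16 / sizeof(T);
    std::printf("x=%zu  head_num=%zu  Dh/x=%zu  L=%zu\n",
                x, head_num, size_per_head / x, max_seq_len);
}

int main() {
    // llama_33B head geometry from llama_config.ini; max_seq_len chosen arbitrarily.
    print_k_cache_dims<std::uint16_t>(52, 128, 1024);  // FP16 stand-in: x = 8, Dh/x = 16
    print_k_cache_dims<float>(52, 128, 1024);          // FP32:          x = 4, Dh/x = 32
    return 0;
}
```

In other words, an FP16 cache packs 8 elements per innermost slice and an FP32 cache packs 4, which is what the `16 / sizeof(T)` terms in the cache shape encode.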
// output_tensors: - // output_ids [batch_size, beam_width, max_output_seq_len] - // sequence_length [batch_size, beam_width] - // output_log_probs [batch_size, beam_width, request_output_seq_len], must be float*. - // optional. It leads to additional computing cost. If we don't need this result, don't put it. - // cum_log_probs [batch_size, beam], optional, must be float*. - // optional. It leads to additional computing cost. If we don't need this result, don't put it. + // output_ids [batch_size, 1, max_output_seq_len] + // sequence_length [batch_size] // Step is from max_input_length ~ max_output_seq_len, // When step = k, we put output ids and caches at step k, and the sequence_length would be k - 1 before @@ -312,21 +256,19 @@ void LLaMA::forward(std::unordered_map* output_ten FT_CHECK(input_tensors->find("output_seq_len") != input_tensors->end() && input_tensors->at("output_seq_len").shape.size() == 1); FT_CHECK(output_tensors->at("output_ids").shape.size() == 3); - FT_CHECK(output_tensors->at("sequence_length").shape.size() == 2); + FT_CHECK(output_tensors->at("sequence_length").shape.size() == 1); FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape[0] == output_tensors->at("output_ids").shape[0], "input_tensors->at(\"input_ids\").shape[0] == output_tensors->at(\"output_ids\").shape[0]"); const size_t batch_size = output_tensors->at("output_ids").shape[0]; - const size_t beam_width = output_tensors->at("output_ids").shape[1]; // NOTE: Prefix Prompt PreProcessing - // get prefix_prompt_weight for each batch --> shape [batch, beam_width] + // get prefix_prompt_weight for each batch --> shape [batch, 1] // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] int max_input_length = input_tensors->at("input_ids").shape[1]; // Prefix Soft Prompt - const size_t limit_len_offset = (max_input_length == 0 ? 
1 : 0); - const size_t max_output_seq_len = input_tensors->at("output_seq_len").max() + limit_len_offset; + const size_t max_output_seq_len = input_tensors->at("output_seq_len").max(); const size_t max_seq_len = max_output_seq_len; // max cache seq len should include max prefix prompt length as it has k/v states const size_t max_cache_seq_len = max_output_seq_len; @@ -343,161 +285,122 @@ void LLaMA::forward(std::unordered_map* output_ten max_seq_len); } const cudaDataType_t gemm_data_type = getCudaDataType(); - allocateBuffer(batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length); - setSeqLimitLen(seq_limit_len_, input_tensors->at("output_seq_len"), limit_len_offset, batch_size); + allocateBuffer(batch_size, max_seq_len, max_cache_seq_len, max_input_length); sync_check_cuda_error(); const DataType data_type = getTensorType(); const std::vector self_k_cache_shape = {num_layer_ / pipeline_para_.world_size_, - batch_size * beam_width, + batch_size, head_num_, size_per_head_ / (16 / sizeof(T)), max_cache_seq_len, 16 / sizeof(T)}; const std::vector self_v_cache_shape = { - num_layer_ / pipeline_para_.world_size_, batch_size * beam_width, head_num_, max_cache_seq_len, size_per_head_}; + num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_cache_seq_len, size_per_head_}; // initialize the output ids and parent ids - cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); - cudaMemsetAsync(parent_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); - cudaMemsetAsync(masked_tokens_, false, sizeof(bool) * batch_size * beam_width * max_cache_seq_len, stream_); - cudaMemsetAsync(tiled_total_padding_count_, 0, sizeof(int) * batch_size * beam_width, stream_); - if (beam_width > 1) { - cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); - } - + cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * max_seq_len, stream_); sync_check_cuda_error(); // handle first step - if (max_input_length > 1) { - invokeTileGptInputs(tiled_input_ids_buf_, - tiled_input_lengths_buf_, - input_tensors->at("input_ids").getPtr(), - input_tensors->at("input_lengths").getPtr(), - batch_size, - beam_width, - max_input_length, - stream_); - sync_check_cuda_error(); - - invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, - output_ids_buf_, - llama_weights->pre_decoder_embedding_table, - llama_weights->position_encoding_table, - pPromptTuningParam{}, // no p/prompt tuning - tiled_input_ids_buf_, - 1, - max_input_length, - max_input_length, - batch_size * beam_width, - hidden_units_, - stream_); - sync_check_cuda_error(); - - invokeBuildDecoderAttentionMask(input_attention_mask_, - tiled_input_lengths_buf_, - nullptr, - batch_size * beam_width, - max_input_length, - 0, - stream_); - sync_check_cuda_error(); - - std::unordered_map decoder_input_tensors{ - {"decoder_input", - Tensor{MEMORY_GPU, - data_type, - {batch_size * beam_width, (size_t)max_input_length, hidden_units_}, - context_decoder_input_buf_}}, - {"attention_mask", - Tensor{MEMORY_GPU, - data_type, - {batch_size * beam_width, 1, (size_t)max_input_length, (size_t)(max_input_length)}, - input_attention_mask_}}, - {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_}}}; - - std::unordered_map decoder_output_tensors{ - {"decoder_output", - Tensor{MEMORY_GPU, - data_type, - {batch_size * beam_width, (size_t)max_input_length, hidden_units_}, - 
context_decoder_output_buf_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, - {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}, - {"last_token_hidden_units", - Tensor{MEMORY_GPU, data_type, {batch_size * beam_width, hidden_units_}, decoder_output_buf_}}}; - - llama_context_decoder_->forward( - &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); - sync_check_cuda_error(); - invokeDecodingInitialize(finished_buf_, - sequence_lengths_, - nullptr, - cum_log_probs_, - start_ids_buf_, - batch_size, - beam_width, - max_input_length - 1, - stream_); - sync_check_cuda_error(); - } - else if (max_input_length == 0) { - max_input_length++; - invokeDecodingInitialize(finished_buf_, - sequence_lengths_, - output_ids_buf_, - cum_log_probs_, - start_ids_buf_, - batch_size, - beam_width, - max_input_length - 1, - stream_); - std::vector h_input_lengths(batch_size * beam_width, 1); - cudaMemcpyAsync(tiled_input_lengths_buf_, - h_input_lengths.data(), - sizeof(int) * batch_size * beam_width, - cudaMemcpyHostToDevice, + invokeTileGptInputs(tiled_input_ids_buf_, + tiled_input_lengths_buf_, + input_tensors->at("input_ids").getPtr(), + input_tensors->at("input_lengths").getPtr(), + batch_size, + 1, + max_input_length, stream_); - sync_check_cuda_error(); - } - else if (max_input_length == 1) { - invokeDecodingInitialize(finished_buf_, - sequence_lengths_, - nullptr, - cum_log_probs_, - start_ids_buf_, - batch_size, - beam_width, - max_input_length - 1, - stream_); - sync_check_cuda_error(); - invokeTileGptInputs(tiled_input_ids_buf_, - tiled_input_lengths_buf_, - input_tensors->at("input_ids").getPtr(), - input_tensors->at("input_lengths").getPtr(), - batch_size, - beam_width, - max_input_length, - stream_); - sync_check_cuda_error(); + sync_check_cuda_error(); - cudaMemcpyAsync(output_ids_buf_, - tiled_input_ids_buf_, - sizeof(int) * batch_size * beam_width, - cudaMemcpyDeviceToDevice, - stream_); - } + invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, + output_ids_buf_, + llama_weights->pre_decoder_embedding_table, + llama_weights->position_encoding_table, + pPromptTuningParam{}, // no p/prompt tuning + tiled_input_ids_buf_, + 1, + max_input_length, + max_input_length, + batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); +// if (pipeline_para_.rank_ == 0) { +// T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * hidden_units_); +// cudaMemcpy(out, +// context_decoder_input_buf_, +// sizeof(T) * batch_size * max_input_length * hidden_units_, +// cudaMemcpyDeviceToHost); +// sync_check_cuda_error(); +// +// for (int b = 0; b < batch_size; ++b) { +// std::cout << "["; +// for (int s = 0; s < max_input_length; ++s) { +// std::cout << "["; +// for (int h = 0; h < 8; ++h) { +// std::cout << out[b * batch_size * hidden_units_ + s * hidden_units_ + h] << " "; +// } +// std::cout << "]\n"; +// } +// std::cout << "]\n"; +// } +// std::cout << "\n"; +// } + + invokeBuildDecoderAttentionMask( + input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, max_input_length, 0, stream_); + sync_check_cuda_error(); - invokeMaskPaddingTokens(masked_tokens_, - input_tensors->at("input_lengths").getPtr(), // not_tiled - nullptr, - max_cache_seq_len, - max_input_length, - 0, - batch_size, - beam_width, - stream_); + std::unordered_map decoder_input_tensors{ + {"decoder_input", + Tensor{ + MEMORY_GPU, data_type, {batch_size, (size_t)max_input_length, hidden_units_}, 
context_decoder_input_buf_}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {batch_size, 1, (size_t)max_input_length, (size_t)(max_input_length)}, + input_attention_mask_}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, tiled_input_lengths_buf_}}}; + + std::unordered_map decoder_output_tensors{ + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {batch_size, (size_t)max_input_length, hidden_units_}, + context_decoder_output_buf_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}, + {"last_token_hidden_units", Tensor{MEMORY_GPU, data_type, {batch_size, hidden_units_}, decoder_output_buf_}}}; + + llama_context_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); + sync_check_cuda_error(); + + // invokeGeneralLLaMALayerNorm( + // context_decoder_input_buf_, + // embedding_input_buf_, + // llama_weights->post_decoder_layernorm.gamma, + // llama_weights->post_decoder_layernorm.beta, + // layernorm_eps_, + // batch_size * max_input_length, + // hidden_units_, + // stream_); + // sync_check_cuda_error(); + // + // cublas_wrapper_->Gemm(CUBLAS_OP_N, + // CUBLAS_OP_N, + // batch_size * max_input_length, + // vocab_size_, + // hidden_units_, + // context_decoder_output_buf_, + // hidden_units_, // n + // llama_weights->post_decoder_embedding.kernel, + // vocab_size_, // k + // /* FIXME */, + // hidden_units_ /* n */); + // sync_check_cuda_error(); setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); @@ -551,43 +454,19 @@ void LLaMA::setOutputTensors(std::unordered_map* o } const size_t batch_size = output_tensors->at("output_ids").shape[0]; - const size_t beam_width = output_tensors->at("output_ids").shape[1]; uint* sequence_lengths = output_tensors->at("sequence_length").getPtr(); if (input_tensors->at("input_ids").shape[1] == 0) { invokeCudaD2DcpyConvert( sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); // TODO: D2D sequence_lenghts - if (beam_width > 1) { - // For beam search, do gather_tree - // take output_parent_ids as inter buffer - invokeGatherTree(transposed_output_ids_buf_, - sequence_lengths_, - max_output_seq_len, - batch_size, - beam_width, - output_ids_buf_ + batch_size * beam_width, - parent_ids_buf_ + batch_size * beam_width, - end_ids_buf_, - stream_); - - // transpose and take output_parent_ids as inter buffer - invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), - transposed_output_ids_buf_, - max_output_seq_len - 1, - batch_size * beam_width, - 1, - stream_); - } - else { - // For sampling, only copy the results to output_tensor - invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), - output_ids_buf_ + batch_size * beam_width, - max_output_seq_len - 1, - batch_size * beam_width, - 1, - stream_); - } + // For sampling, only copy the results to output_tensor + invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), + output_ids_buf_ + batch_size, + max_output_seq_len - 1, + batch_size, + 1, + stream_); } else { @@ -599,9 +478,9 @@ void LLaMA::setOutputTensors(std::unordered_map* o param.max_sequence_length_final_step = 1; param.max_time = max_output_seq_len; param.batch_size = batch_size; - param.beam_width = beam_width; + param.beam_width = 1; param.step_ids = output_ids_buf_; - param.parent_ids = 
beam_width == 1 ? nullptr : parent_ids_buf_; + param.parent_ids = nullptr; param.end_tokens = end_ids_buf_; param.max_input_length = max_input_length; param.prefix_soft_prompt_lengths = nullptr; @@ -615,21 +494,6 @@ void LLaMA::setOutputTensors(std::unordered_map* o sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); sync_check_cuda_error(); } - if ((output_tensors->count("output_log_probs") > 0 && output_tensors->at("output_log_probs").data != nullptr)) { - invokeTransposeAxis01(output_tensors->at("output_log_probs").getPtr(), - output_log_probs_buf_, - input_tensors->at("output_seq_len").max() - max_input_length, - batch_size * beam_width, - 1, - stream_); - } - // Return the cumulative log probability if requested. - if (output_tensors->count("cum_log_probs") > 0) { - Tensor cum_log_probs = output_tensors->at("cum_log_probs"); - FT_CHECK_WITH_INFO(cum_log_probs.size() == batch_size * beam_width, - "The shape of cum_log_probs does not match with batch_size x beam_width."); - cudaAutoCpy(cum_log_probs.getPtr(), cum_log_probs_, cum_log_probs.size(), stream_); - } } template @@ -644,12 +508,6 @@ size_t LLaMA::getPipelineParallelSize() return pipeline_para_.world_size_; } -template -bool* LLaMA::getFinishBuffer() -{ - return finished_buf_; -} - template class LLaMA; template class LLaMA; diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 303236b72..68b7cef4c 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -39,8 +39,6 @@ class LLaMA: public BaseLayer { static constexpr bool neox_rotary_style_ = true; static constexpr float layernorm_eps_ = 1e-6f; - int start_id_; - int end_id_; size_t hidden_units_; NcclParam tensor_para_; @@ -58,27 +56,18 @@ class LLaMA: public BaseLayer { void allocateBuffer() override; void allocateBuffer( - size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); + size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); void freeBuffer() override; void initialize(); protected: T* input_attention_mask_; - - T* decoder_input_buf_; T* decoder_output_buf_; - T* normed_decoder_output_buf_; float* logits_buf_; - float* nccl_logits_buf_; - float* cum_log_probs_; - bool* finished_buf_; - bool* h_finished_buf_; int* sequence_lengths_ = nullptr; - int* tiled_total_padding_count_ = nullptr; - uint32_t* seq_limit_len_ = nullptr; T* key_cache_; T* value_cache_; @@ -88,16 +77,11 @@ class LLaMA: public BaseLayer { int* tiled_input_lengths_buf_; int* transposed_output_ids_buf_; int* output_ids_buf_; - int* parent_ids_buf_; int* start_ids_buf_; int* end_ids_buf_; - bool* masked_tokens_ = nullptr; - - bool* generation_should_stop_ = nullptr; T* context_decoder_input_buf_; T* context_decoder_output_buf_; - float* output_log_probs_buf_; // function pointer callback using callback_sig = void(std::unordered_map*, void*); @@ -118,8 +102,6 @@ class LLaMA: public BaseLayer { size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, - int start_id, - int end_id, unsigned long long random_seed, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -136,8 +118,6 @@ class LLaMA: public BaseLayer { size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, - int start_id, - int end_id, unsigned long long random_seed, NcclParam tensor_para, NcclParam pipeline_para, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc 
b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 119c98041..c373c9d09 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -66,8 +66,6 @@ void LLaMAContextDecoder::allocateBuffer() template void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) { - decoder_normed_input_ = reinterpret_cast( - allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); self_attn_output_ = reinterpret_cast( allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); ffn_output_ = reinterpret_cast( @@ -85,7 +83,6 @@ template void LLaMAContextDecoder::freeBuffer() { if (is_allocate_buffer_ == true) { - allocator_->free((void**)(&decoder_normed_input_)); allocator_->free((void**)(&self_attn_output_)); allocator_->free((void**)(&ffn_output_)); allocator_->free((void**)(&decoder_layer_output_)); @@ -220,7 +217,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* // value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] // last_token_hidden_units [batch_size, hidden_dimension] - // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * local_batch_size'. + // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * batch_size'. // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. @@ -238,20 +235,15 @@ void LLaMAContextDecoder::forward(std::unordered_map* T* decoder_output = output_tensors->at("decoder_output").getPtr(); const T* attention_mask = input_tensors->at("attention_mask").getPtr(); - // const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); - const int local_batch_size = batch_size; - FT_CHECK(batch_size % local_batch_size == 0); - const int iteration_num = batch_size / local_batch_size; - Tensor& k_cache = output_tensors->at("key_cache"); Tensor& v_cache = output_tensors->at("value_cache"); std::vector self_k_cache_size; - self_k_cache_size.push_back(local_batch_size); + self_k_cache_size.push_back(batch_size); for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { self_k_cache_size.push_back(*t); } std::vector self_v_cache_size; - self_v_cache_size.push_back(local_batch_size); + self_v_cache_size.push_back(batch_size); for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { self_v_cache_size.push_back(*t); } @@ -259,158 +251,136 @@ void LLaMAContextDecoder::forward(std::unordered_map* AttentionType attention_type = attention_type_; const bool is_unpadded_mha = isUnPaddedMHA(attention_type); - for (int ite = 0; ite < iteration_num; ite++) { - size_t h_token_num = local_batch_size * seq_len; - if (is_unpadded_mha) { - const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); - invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, - &h_token_num, - padding_offset_, - cu_seqlens_, - base_input_lengths + ite * local_batch_size, - local_batch_size, - seq_len, - stream_); + size_t h_token_num = batch_size * seq_len; + if (is_unpadded_mha) { + const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, + &h_token_num, + padding_offset_, + cu_seqlens_, + base_input_lengths, + batch_size, + seq_len, + stream_); + } + + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l) == false) { 
+ continue; } - for (int l = 0; l < num_layer_; l++) { - if (isValidLayerParallelId(l) == false) { - continue; - } - if (l == 0 && is_unpadded_mha) { - invokeRemovePadding(decoder_layer_output_, - decoder_input + ite * local_batch_size * seq_len * hidden_units_, - padding_offset_, - h_token_num, - hidden_units_, - stream_); - } + if (l == 0 && is_unpadded_mha) { + invokeRemovePadding( + decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); + } - const bool is_final = false; // TODO(bhsueh) remove this flag - T* layer_input = decoder_layer_output_; - T* layer_output = decoder_layer_output_; - if (!is_unpadded_mha) { - if (l == 0) { - layer_input = decoder_input; - layer_input += ite * local_batch_size * seq_len * hidden_units_; - } - if (l == num_layer_ - 1) { - layer_output = decoder_output; - layer_output += ite * local_batch_size * seq_len * hidden_units_; - } + const bool is_final = false; // TODO(bhsueh) remove this flag + T* layer_input = decoder_layer_output_; + T* layer_output = decoder_layer_output_; + if (!is_unpadded_mha) { + if (l == 0) { + layer_input = decoder_input; } - - if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_; - std::cout << __FILE__ << ":" << __LINE__ << "\n"; - std::cout << "Recv: " << layer_output << "," << data_size << "\n"; - ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); + if (l == num_layer_ - 1) { + layer_output = decoder_output; } + } + + if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_; + ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); + } + + TensorMap self_attention_input_tensors{ + {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, + attention_mask}}, + {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, + {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, + {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, + {"pre_layernorm_weights_gamma", + Tensor{MEMORY_GPU, + data_type, + {(size_t)hidden_units_}, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma}}, + {"pre_layernorm_weights_beta", + Tensor{MEMORY_GPU, + data_type, + {(size_t)hidden_units_}, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta}}}; + + if (is_unpadded_mha) { + self_attention_input_tensors.insert("padding_offset", + Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset_}); + self_attention_input_tensors.insert("cu_seqlens", + Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens_}); + } + + size_t cache_offset = l - getFirstLayerParallelId(); + for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { + cache_offset *= *t; + }; + + TensorMap self_attention_output_tensors{ + {"hidden_features", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + + self_attention_layer_->forward(&self_attention_output_tensors, + 
&self_attention_input_tensors, + &llama_decoder_layer_weight->at(l)->self_attention_weights); + + if (is_final == false) { + invokeGeneralAddBiasResidualPreLayerNorm( + self_attn_output_, + layer_input, + self_attn_output_, + layer_input, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + h_token_num, + hidden_units_, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + 0, + stream_); + + TensorMap ffn_input_tensors( + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}}); + TensorMap ffn_output_tensors( + {{"ffn_output", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); + ffn_layer_->forward( + &ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); + + invokeAddBiasResidual(layer_output, + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + stream_); - invokeGeneralLLaMALayerNorm(decoder_normed_input_, - layer_input, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, - layernorm_eps_, - h_token_num, - hidden_units_, - stream_); sync_check_cuda_error(); - TensorMap self_attention_input_tensors{ - {"input_query", - Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, - {"attention_mask", - Tensor{MEMORY_GPU, - data_type, - {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, - attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, - {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, - {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, - {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; - - if (is_unpadded_mha) { - self_attention_input_tensors.insert("padding_offset", - Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset_}); - self_attention_input_tensors.insert( - "cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {size_t(local_batch_size + 1)}, cu_seqlens_}); + if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 + && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_; + ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); } - size_t cache_offset = l - getFirstLayerParallelId(); - for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { - cache_offset *= *t; - }; - size_t ite_cache_offset = ite * local_batch_size; - for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { - ite_cache_offset *= *t; - } - cache_offset += ite_cache_offset; - - TensorMap self_attention_output_tensors{ - {"hidden_features", - Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, - {"value_cache", - Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; - - self_attention_layer_->forward(&self_attention_output_tensors, - &self_attention_input_tensors, - &llama_decoder_layer_weight->at(l)->self_attention_weights); - - if (is_final == false) { 
- invokeGeneralAddBiasResidualPreLayerNorm( - self_attn_output_, - decoder_normed_input_, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, - layernorm_eps_, - h_token_num, - hidden_units_, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - 0, - stream_); - - TensorMap ffn_input_tensors( - {{"ffn_input", - Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); - TensorMap ffn_output_tensors( - {{"ffn_output", - Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); - ffn_layer_->forward( - &ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); - - invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - h_token_num, - hidden_units_, - stream_); - - sync_check_cuda_error(); - - if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 - && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_; - std::cout << __FILE__ << ":" << __LINE__ << "\n"; - std::cout << "Send: " << layer_output << "," << data_size << "\n"; - ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); - std::cout << __FILE__ << ":" << __LINE__ << "\n"; - } - - if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, - decoder_layer_output_, - padding_offset_, - h_token_num, - head_num_ * size_per_head_, - stream_); - } + if ((l == num_layer_ - 1) && is_unpadded_mha) { + invokeRebuildPadding(decoder_output, + decoder_layer_output_, + padding_offset_, + h_token_num, + head_num_ * size_per_head_, + stream_); } } } diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc index f7081de11..f1c51e340 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.cc +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -124,7 +124,6 @@ void LLaMAWeight::setWeightPtr() { pre_decoder_embedding_table = weights_ptr[0]; post_decoder_layernorm.beta = weights_ptr[1]; - post_decoder_layernorm.beta = nullptr; post_decoder_layernorm.gamma = weights_ptr[2]; post_decoder_embedding.kernel = weights_ptr[3]; } From 81dc94ad9c71038b94a8ccf4eaf133ac8f6e8631 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 16 Sep 2023 15:05:17 +0000 Subject: [PATCH 12/55] dump --- examples/cpp/llama/llama_config.ini | 1 + examples/cpp/llama/llama_example.cc | 7 ++- .../kernels/unfused_attention_kernels.cu | 3 ++ .../LLaMAContextAttentionLayer.cc | 38 ++++++++++++++-- src/fastertransformer/models/llama/LLaMA.cc | 45 ++++++++++--------- src/fastertransformer/models/llama/LLaMA.h | 2 +- 6 files changed, 68 insertions(+), 28 deletions(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 9cb766533..1e92695e5 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -16,4 +16,5 @@ vocab_size=32000 decoder_layers=60 rotary_embedding=128 multiple_of=256 +max_cache_seq_len=1024 padding_id=0 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 2955cbb14..ebdf7cb9e 100644 --- 
a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -81,13 +81,14 @@ void llama_example(const INIReader reader) const size_t decoder_layers = reader.GetInteger(model_name, "decoder_layers"); const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); const int multiple_of = reader.GetInteger(model_name, "multiple_of"); + const size_t max_cache_seq_len = reader.GetInteger(model_name, "max_cache_seq_len"); const size_t hidden_units = head_num * size_per_head; const size_t inter_size = multiple_of * ((2 * hidden_units + multiple_of - 1) / multiple_of); const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); const int min_length = reader.GetInteger("request", "min_length", 0); - const int padding_id = reader.GetInteger(model_name, "padding_id"); + const int padding_id = reader.GetInteger(model_name, "padding_id"); FT_CHECK(decoder_layers % pipeline_para_size == 0); @@ -224,7 +225,9 @@ void llama_example(const INIReader reader) {"output_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}, {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{1}, &min_length}}, - {"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}}; + {"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}, + {"max_cache_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &max_cache_seq_len}} + }; std::unordered_map output_tensors = std::unordered_map{ {"output_ids", diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index d0fb0a197..b2f7d7809 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1407,6 +1407,9 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int src_k_idx = token_idx * 3 * n + hidden_idx + n; const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; + if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0) + printf("is_masked: %d, do_rotary: %d\n", is_masked, do_rotary); + Vec_t q, k, v; Vec_t q_bias, k_bias, v_bias; if (!is_masked) { diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 8837acb82..2297b9999 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -160,11 +160,43 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten 0, // int8_mode stream_); sync_check_cuda_error(); + if (layer_id == 0) { + // shape: [B, H, L, Dh] + T* q_buf = (T*)malloc(sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_); + T* k_buf = q_buf + request_batch_size * request_seq_len * hidden_units_; + T* v_buf = k_buf + request_batch_size * request_seq_len * hidden_units_; + cudaMemcpy(q_buf, + q_buf_2_, + sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_, + cudaMemcpyDeviceToHost); + sync_check_cuda_error(); + + for (int b = 0; b < request_batch_size; ++b) { + std::cout << "["; + for (int h = 0; h < head_num_; ++h) { + std::cout << "["; + for (int s = 0; s < request_seq_len; ++s) { + std::cout << "["; + for (int e = 0; e < 8; ++e) { + std::cout << k_buf[b * head_num_ * request_seq_len * size_per_head_ + + h * request_seq_len * size_per_head_ + + s * size_per_head_ + + e] 
+ << " "; + } + std::cout << "]\n"; + } + std::cout << "]\n"; + } + std::cout << "]\n"; + } + std::cout << "\n"; + } const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length // Use batch major - // put k/v_buf from shape [B, H, PL + L, Dh] - // to cache [B, H, Dh/x, PL + L, x] and [B, H, PL + L, Dh/x, x], PL denotes prompt length + // put k/v_buf from shape [B, H, L, Dh] + // to cache [B, H, Dh/x, L, x] and [B, H, L, Dh/x, x] invokeTranspose4dBatchMajor(output_tensors->getPtr("key_cache"), output_tensors->getPtr("value_cache"), k_buf_2_, @@ -175,7 +207,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten size_per_head_, head_num_, stream_); - // IDEA : after this, + // IDEA : after this, // k_cache = (batch_size, num_heads, Dh/x, L, x) // v_cache = (batch_size, num_heads, L, Dh) sync_check_cuda_error(); diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 3b52fe2e1..c74dd4663 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -238,6 +238,7 @@ void LLaMA::forward(std::unordered_map* output_ten // output_seq_len [batch_size] on cpu // min_length [1] or [batch_size] on cpu, optional, int // random_seed [1] or [batch_size] on cpu, optional, unsigned long long int. + // max_cache_seq_len [batch_size] on cpu // output_tensors: // output_ids [batch_size, 1, max_output_seq_len] @@ -271,7 +272,7 @@ void LLaMA::forward(std::unordered_map* output_ten const size_t max_output_seq_len = input_tensors->at("output_seq_len").max(); const size_t max_seq_len = max_output_seq_len; // max cache seq len should include max prefix prompt length as it has k/v states - const size_t max_cache_seq_len = max_output_seq_len; + const size_t max_cache_seq_len = input_tensors->at("max_cache_seq_len").max(); if (max_cache_seq_len < max_seq_len) { FT_LOG_WARNING("max_cache_seq_len (%d) is less than max_seq_len (%d). 
" "Note that this reduces the memory cost of k/v cache, but may hurt the accuracy.", @@ -327,27 +328,27 @@ void LLaMA::forward(std::unordered_map* output_ten hidden_units_, stream_); sync_check_cuda_error(); -// if (pipeline_para_.rank_ == 0) { -// T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * hidden_units_); -// cudaMemcpy(out, -// context_decoder_input_buf_, -// sizeof(T) * batch_size * max_input_length * hidden_units_, -// cudaMemcpyDeviceToHost); -// sync_check_cuda_error(); -// -// for (int b = 0; b < batch_size; ++b) { -// std::cout << "["; -// for (int s = 0; s < max_input_length; ++s) { -// std::cout << "["; -// for (int h = 0; h < 8; ++h) { -// std::cout << out[b * batch_size * hidden_units_ + s * hidden_units_ + h] << " "; -// } -// std::cout << "]\n"; -// } -// std::cout << "]\n"; -// } -// std::cout << "\n"; -// } + // if (pipeline_para_.rank_ == 0) { + // T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * hidden_units_); + // cudaMemcpy(out, + // context_decoder_input_buf_, + // sizeof(T) * batch_size * max_input_length * hidden_units_, + // cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < batch_size; ++b) { + // std::cout << "["; + // for (int s = 0; s < max_input_length; ++s) { + // std::cout << "["; + // for (int h = 0; h < 8; ++h) { + // std::cout << out[b * batch_size * hidden_units_ + s * hidden_units_ + h] << " "; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } invokeBuildDecoderAttentionMask( input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, max_input_length, 0, stream_); diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 68b7cef4c..386a09cd4 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -36,7 +36,7 @@ class LLaMA: public BaseLayer { size_t vocab_size_; size_t rotary_embedding_dim_; - static constexpr bool neox_rotary_style_ = true; + static constexpr bool neox_rotary_style_ = false; static constexpr float layernorm_eps_ = 1e-6f; size_t hidden_units_; From d5b2c12b846bb8c4cc4336cd4656d3233af798d3 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 16 Sep 2023 15:07:40 +0000 Subject: [PATCH 13/55] for junsik --- src/fastertransformer/kernels/unfused_attention_kernels.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index b2f7d7809..8d4c5e6da 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1407,8 +1407,6 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int src_k_idx = token_idx * 3 * n + hidden_idx + n; const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; - if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0) - printf("is_masked: %d, do_rotary: %d\n", is_masked, do_rotary); Vec_t q, k, v; Vec_t q_bias, k_bias, v_bias; From 8ec39b5f19ef9abe82be25cc5afe9629e6637b3e Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 16 Sep 2023 20:25:38 +0000 Subject: [PATCH 14/55] first success --- examples/cpp/llama/llama_example.cc | 2 +- .../kernels/layernorm_kernels.cu | 663 +++++++++++++++++- .../kernels/layernorm_kernels.h | 14 + .../kernels/unfused_attention_kernels.cu | 3 +- .../LLaMAContextAttentionLayer.cc | 435 ++++++------ .../LLaMAContextAttentionLayer.h | 2 - 
src/fastertransformer/models/llama/LLaMA.cc | 70 +- src/fastertransformer/models/llama/LLaMA.h | 1 + .../models/llama/LLaMAContextDecoder.cc | 235 +++++-- .../models/llama/LLaMADecoderLayerWeight.cc | 12 +- 10 files changed, 1109 insertions(+), 328 deletions(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index ebdf7cb9e..43f55c4b7 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -84,7 +84,7 @@ void llama_example(const INIReader reader) const size_t max_cache_seq_len = reader.GetInteger(model_name, "max_cache_seq_len"); const size_t hidden_units = head_num * size_per_head; - const size_t inter_size = multiple_of * ((2 * hidden_units + multiple_of - 1) / multiple_of); + const size_t inter_size = multiple_of * (((8 * hidden_units / 3) + multiple_of - 1) / multiple_of); const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); const int min_length = reader.GetInteger("request", "min_length", 0); diff --git a/src/fastertransformer/kernels/layernorm_kernels.cu b/src/fastertransformer/kernels/layernorm_kernels.cu index b19e9ac73..6244dbfd6 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.cu +++ b/src/fastertransformer/kernels/layernorm_kernels.cu @@ -19,6 +19,237 @@ #include "src/fastertransformer/utils/cuda_type_utils.cuh" namespace fastertransformer { +// __global__ void generalLLaMAAddBiasResidualLayerNormOpt(T* normed_output, +// __global__ void generalLLaMAAddBiasResidualLayerNormOpt2(T* normed_output, +// __global__ void generalLLaMAAddBiasResidualLayerNorm(const T* __restrict input, + +template +__global__ void generalLLaMAAddBiasResidualLayerNormOpt(T* normed_output, + T* output, + const T* __restrict input, + const T* __restrict bias, + const T* __restrict residual1, + const T* __restrict residual2, + const T* __restrict gamma, + const T* __restrict beta, + const float layernorm_eps, + int m, + int n) +{ + extern __shared__ __align__(sizeof(float)) char _shmem[]; // Align on largest type + T* shmem = reinterpret_cast(_shmem); + + __shared__ float s_variance; + float variance = 0.0f; + + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + T local_sum = cuda_cast(0.0f); + + const Float_Packed_T scale_from_int = cuda_cast(0.0f); + const Float_Packed_T scale_to_int = cuda_cast(0.0f); + +#pragma unroll + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + T val = cuda_cast(0.0f); + + if (IS_BIAS) { + val = hadd2(val, ldg(&bias[i])); + } + if (RESIDUAL_NUM == 1) { + val = hadd2(val, ldg(&residual1[index])); + } + else if (RESIDUAL_NUM == 2) { + val = hadd2(hadd2(val, ldg(&residual1[index])), ldg(&residual2[index])); + } + + if (IS_OUTPUT) { + T in_val; + in_val = input[index]; + val = hadd2(val, in_val); + } + shmem[i] = val; + output[index] = val; + local_sum = hadd2(local_sum, val); + } + + float local_var_sum = 0.0f; +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + T val = input[blockIdx.x * n + i]; + float diff_1 = (float)(val.x); + float diff_2 = (float)(val.y); + local_var_sum += (diff_1 * diff_1 + diff_2 * diff_2); + } + variance = blockReduceSum(local_var_sum); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / n / 2 + layernorm_eps); + } + __syncthreads(); + + T var_2 = cuda_cast(s_variance); + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + 
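        // Write-back step of the fused kernel: the value cached in shmem is scaled by var_2
        // (the broadcast s_variance = rsqrtf(variance / n / 2 + layernorm_eps) computed above)
        // and by gamma[i], with beta added only when IS_BETA is set -- an RMS-style
        // normalization, roughly y = x * rsqrt(mean(x^2) + eps) * gamma (+ beta).
        // The divisor is n * 2 because each packed T (half2 / bfloat162) holds two scalars.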
T val = hmul2(shmem[i], var_2, ldg(&gamma[i])); + if (IS_BETA) { + val = hadd2(val, ldg(&beta[i])); + } + + normed_output[index] = val; + } +} + +// * Note that typename T is half2 or bfloat2 type +template +__global__ void generalLLaMAAddBiasResidualLayerNormOpt2(T* normed_output, + T* output, + const T* __restrict input, + const T* __restrict bias, + const T* __restrict residual1, + const T* __restrict residual2, + const T* __restrict gamma, + const T* __restrict beta, + const float layernorm_eps, + int m, + int n) +{ + extern __shared__ __align__(sizeof(float)) char _shmem[]; + T* shmem = reinterpret_cast(_shmem); + + __shared__ float s_variance; + float x2_sum = 0.0f; + const int b_offset = blockIdx.x * n; + + using T1 = typename TypeConverter::Type; + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + const Float_Packed_T scale_vec_in = cuda_cast(0.0f); + const Float_Packed_T scale_vec = cuda_cast(0.0f); + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = b_offset + i; + float val_1 = 0.0f; + float val_2 = 0.0f; + T tmp; + + if (IS_BIAS) { + tmp = ldg(&bias[i]); + val_1 += static_cast(tmp.x); + val_2 += static_cast(tmp.y); + } + if (RESIDUAL_NUM == 1) { + tmp = ldg(&residual1[index]); + val_1 += static_cast(tmp.x); + val_2 += static_cast(tmp.y); + } + else if (RESIDUAL_NUM == 2) { + tmp = ldg(&residual1[index]); + T tmp2 = ldg(&residual2[index]); + val_1 += (static_cast(tmp.x) + static_cast(tmp2.x)); + val_2 += (static_cast(tmp.y) + static_cast(tmp2.y)); + } + + if (IS_OUTPUT) { + tmp = ldg(&input[index]); + val_1 += static_cast(tmp.x); + val_2 += static_cast(tmp.y); + } + tmp.x = cuda_cast(val_1); + tmp.y = cuda_cast(val_2); + shmem[i] = tmp; + output[index] = tmp; + x2_sum += val_1 * val_1 + val_2 * val_2; + } + float sum_sq = blockReduceSum(x2_sum); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(sum_sq / n / 2 + layernorm_eps); + } + __syncthreads(); + + T var_2 = cuda_cast(s_variance); + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + T val = hmul2(shmem[i], var_2, ldg(&gamma[i])); + if (IS_BETA) { + val = hadd2(val, ldg(&beta[i])); + } + + normed_output[index] = val; + } +} + +template +__global__ void generalLLaMAAddBiasResidualLayerNorm(const T* __restrict input, + const T* __restrict residual1, + const T* __restrict residual2, + const T* __restrict gamma, + const T* __restrict beta, + const T* __restrict bias, + T* output, + T* norm_output, + const float layernorm_eps, + int m, + int n) +{ + int tid = threadIdx.x; + + // NOTE: float shmem may exceed the shared memory limit + extern __shared__ __align__(sizeof(float)) char _shmem[]; + T* shmem = reinterpret_cast(_shmem); + + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + __shared__ float s_variance; + float variance = 0.0f; + float local_sum = 0.0f; + for (int i = tid; i < n; i += blockDim.x) { + float local_out = 0.0f; + if (RESIDUAL_NUM == 1) { + local_out = (float)(ldg(&residual1[blockIdx.x * n + i])); + } + else if (RESIDUAL_NUM == 2) { + local_out = (float)(ldg(&residual1[blockIdx.x * n + i])) + float(ldg(&residual2[blockIdx.x * n + i])); + } + local_out += (float)(input[blockIdx.x * n + i]); + + if (bias != nullptr) { + local_out += (float)(ldg(&bias[i])); + } + shmem[i] = (T)local_out; + output[blockIdx.x * n + i] = (T)local_out; + local_sum += local_out; + } + + float 
local_var_sum = 0.0f; + for (int i = tid; i < n; i += blockDim.x) { + float diff = (float)(output[blockIdx.x * n + i]); + local_var_sum += diff * diff; + } + variance = blockReduceSum(local_var_sum); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / n + layernorm_eps); + } + __syncthreads(); + + for (int i = tid; i < n; i += blockDim.x) { + float beta_val = (beta == nullptr) ? 0.0f : (float)(ldg(&beta[i])); + const float val = (((float)shmem[i] * s_variance) * (float)(ldg(&gamma[i])) + beta_val); + + norm_output[blockIdx.x * n + i] = (T)val; + } +} // * Note that typename T is half2 or bfloat2 type template @@ -841,6 +1072,51 @@ __global__ void generalAddBiasResidualLayerNorm(const T* __restrict input, } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_opt_version(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version) +{ + size_t maxbytes = half_n * sizeof(T); + if (opt_version == 1) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNormOpt, + cudaFuncAttributeMaxDynamicSharedMemorySize, + maxbytes)); + } + generalLLaMAAddBiasResidualLayerNormOpt + <<>>( + norm_output, output, input, bias, residual1, residual2, gamma, beta, layernorm_eps, m, half_n); + } + else if (opt_version == 2) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNormOpt2, + cudaFuncAttributeMaxDynamicSharedMemorySize, + maxbytes)); + } + generalLLaMAAddBiasResidualLayerNormOpt2 + <<>>( + norm_output, output, input, bias, residual1, residual2, gamma, beta, layernorm_eps, m, half_n); + } + else { + FT_CHECK_WITH_INFO(false, "opt_num must be 1 or 2"); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_opt_version(T* norm_output, T* output, @@ -919,6 +1195,62 @@ void dispatch_generalAddBiasResidualLayerNormOpt_opt_version(T* norm_o } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_is_output(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output) +{ + if (is_output) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_opt_version( + norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version); + } + else { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_opt_version( + norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_is_output(T* norm_output, T* output, @@ -990,6 +1322,62 @@ void dispatch_generalAddBiasResidualLayerNormOpt_is_output(T* norm_out } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_bias(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output) +{ + if (bias != nullptr) { + 
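        // The dispatch_* helpers in this chain only translate runtime flags (bias != nullptr,
        // is_output, residual_num, unroll_factor, opt_version) into compile-time template
        // arguments, so the fused add-bias-residual + normalization kernel is fully specialized
        // at launch instead of branching per element (see the
        // "unroll_factor -> residual_num -> is_bias -> opt_version" comment further below).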
dispatch_generalLLaMAAddBiasResidualLayerNormOpt_is_output(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } + else { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_is_output(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_bias(T* norm_output, T* output, @@ -1061,6 +1449,66 @@ void dispatch_generalAddBiasResidualLayerNormOpt_bias(T* norm_output, } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output, + int residual_num) +{ + if (residual_num == 1) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_bias(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } + else if (residual_num == 2) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_bias(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } + else { + FT_CHECK_WITH_INFO(false, "residual_num must be 1 or 2"); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_residual_num(T* norm_output, T* output, @@ -1136,6 +1584,108 @@ void dispatch_generalAddBiasResidualLayerNormOpt_residual_num(T* norm_ } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_unroll_factor(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output, + int residual_num, + int unroll_factor) +{ + switch (unroll_factor) { + case 1: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + case 2: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + case 4: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + case 8: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + default: + FT_CHECK_WITH_INFO(false, "unroll_factor must be 1, 2, 4 or 8"); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_unroll_factor(T* norm_output, T* output, @@ -1263,6 
+1813,105 @@ void dispatch_generalAddBiasResidualLayerNormOpt_unroll_factor(T* norm } } +template +void invokeGeneralLLaMAAddBiasResidualPreLayerNorm(T* output, + T* norm_output, + const T* input, + const T* residual1, + const T* gamma, + const T* beta, + const T* bias, + const float layernorm_eps, + int m, + int n, + cudaStream_t stream, + int opt_version) +{ + const int residual_num = 1; + if (opt_version > 0 && sizeof(T) == 2 && n % 2 == 0) { + dim3 grid(m); + int half_n = n / 2; + int half_n_32 = (half_n + 31) / 32 * 32; + dim3 block(min(half_n_32, 512)); + int rolls_per_thread = half_n / block.x; + int unroll_factor = 8; + while (unroll_factor > rolls_per_thread && unroll_factor > 1) { + unroll_factor /= 2; + } + + using T2 = typename TypeConverter::Type; + + /* we launch (and instantiate) the kernel by specializing for unroll_factor -> residual_num -> is_bias -> + * opt_version */ + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_unroll_factor((T2*)norm_output, + (T2*)output, + (const T2*)input, + (const T2*)bias, + (const T2*)residual1, + (const T2*)nullptr, + (const T2*)gamma, + (const T2*)beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + true, // is_output + residual_num, + unroll_factor); + } + else { + + dim3 grid(m); + dim3 block(min(n, 1024)); + + /* For general cases, n is equal to hidden_units, e.g., 512/1024. + Since we have warp shuffle inside the code, block.x % 32 should be 0. + */ + block.x = (block.x + 31) / 32 * 32; + + size_t maxbytes = n * sizeof(T); + if (residual_num == 1) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNorm, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes)); + } + generalLLaMAAddBiasResidualLayerNorm<<>>( + input, residual1, nullptr, gamma, beta, bias, output, norm_output, layernorm_eps, m, n); + } + else if (residual_num == 2) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNorm, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes)); + } + generalLLaMAAddBiasResidualLayerNorm<<>>( + input, residual1, nullptr, gamma, beta, bias, output, norm_output, layernorm_eps, m, n); + } + } +} + +#define INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(T) \ + template void invokeGeneralLLaMAAddBiasResidualPreLayerNorm(T* output, \ + T* norm_output, \ + const T* input, \ + const T* residual1, \ + const T* gamma, \ + const T* beta, \ + const T* bias, \ + const float layernorm_eps, \ + int m, \ + int n, \ + cudaStream_t stream, \ + int opt_version) +INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(float); +INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(half); +#ifdef ENABLE_BF16 +INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(__nv_bfloat16); +#endif +#undef INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM + /* output <- output + bias + residual_1 + residual_2 * output_norm <- LN(output) */ template @@ -1875,29 +2524,29 @@ __global__ void generalLLaMALayerNorm(const T* __restrict input, extern __shared__ __align__(sizeof(float)) char _shmem[]; T* shmem = reinterpret_cast(_shmem); - __shared__ float s_mean_sq; - float mean_sq = 0.0f; + __shared__ float s_variance; + float variance = 0.0f; using Float_Packed_T = typename packed_as::value>::type; using Scalar_T = typename packed_as::type; - float local_sum = 0.0f; + float local_var_sum = 0.0f; for (int i = tid; i < n; i += blockDim.x) { float val = 
(float)(ldg(&input[blockIdx.x * n + i])); - local_sum += val * val; + local_var_sum += val * val; } - mean_sq = blockReduceSum(local_sum); + variance = blockReduceSum(local_var_sum); if (threadIdx.x == 0) { - s_mean_sq = rsqrtf(mean_sq / (float)n + layernorm_eps); + s_variance = rsqrtf(variance / (float)n + layernorm_eps); } __syncthreads(); for (int i = tid; i < n; i += blockDim.x) { const int index = blockIdx.x * n + i; float beta_val = (beta == nullptr) ? 0.0f : (float)ldg(&beta[i]); - T val = (T)(((float)input[index] * s_mean_sq) * (float)(ldg(&gamma[i])) + beta_val); + T val = (T)(((float)input[index] * s_variance) * (float)(ldg(&gamma[i])) + beta_val); normed_output[index] = val; } diff --git a/src/fastertransformer/kernels/layernorm_kernels.h b/src/fastertransformer/kernels/layernorm_kernels.h index 5c5c03c7a..8fb8ecf8b 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.h +++ b/src/fastertransformer/kernels/layernorm_kernels.h @@ -62,6 +62,20 @@ void invokeAddBiasResidualLayerNorm(T* out, const int n, cudaStream_t stream); +template +void invokeGeneralLLaMAAddBiasResidualPreLayerNorm(T* output, + T* norm_output, + const T* input, + const T* residual1, + const T* gamma, + const T* beta, + const T* bias, + const float layernorm_eps, + int m, + int n, + cudaStream_t stream, + int opt_version = 2); + template void invokeGeneralAddBiasResidualPreLayerNorm(T* output, T* norm_output, diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 8d4c5e6da..61d2a54ff 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1364,8 +1364,8 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int tidx = threadIdx.x; const int total_seq_len = param.max_prefix_prompt_length + seq_len; - const bool is_masked = tidx * vec_size >= size_per_head; + // NOTE: blockIdx.x < batch_size * param.max_prefix_prompt_length really handles prefix prompts if (PREFIX_PROMPT && token_idx < 0) { const int prompt_batch_idx = blockIdx.x / param.max_prefix_prompt_length; @@ -1407,7 +1407,6 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int src_k_idx = token_idx * 3 * n + hidden_idx + n; const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; - Vec_t q, k, v; Vec_t q_bias, k_bias, v_bias; if (!is_masked) { diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 2297b9999..70e638150 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -31,13 +31,10 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // input_query [token_num, hidden_dimension] // attention_mask [batch_size, 1, seq_len, seq_len] // attention_type [1] - // is_final_layer [1], bool on cpu // layer_id [1], int on cpu // padding_offset, int, [token_num] (optional) // cu_seqlens, int, [batch_size] (optional) // each element contains ptr with buffer shape[2, head_num_, prompt_length, size_per_head] - // pre_layernorm_weights_gamma [hidden_dimension] - // pre_layernorm_weights_beta [hidden_dimension] // output_tensors: // hidden_features [token_num, hidden_dimension] @@ -46,13 +43,11 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); 
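    // The two shape checks below encode the expected cache layouts (cf. the comment near
    // invokeTranspose4dBatchMajor further down): key_cache is 5-D,
    // [batch, head_num, size_per_head / x, max_seq_len, x], and value_cache is 4-D,
    // [batch, head_num, max_seq_len, size_per_head], where x is presumably the 16-byte
    // packing width (e.g. x = 8 for FP16).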
FT_CHECK(output_tensors->at("key_cache").shape.size() == 5); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); - const int request_batch_size = input_tensors->at("attention_mask").shape[0]; - const int request_seq_len = input_tensors->at("attention_mask").shape[2]; - const int layer_id = input_tensors->getVal("layer_id"); - const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); - int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); - const T* pre_layernorm_weights_gamma = input_tensors->getPtr("pre_layernorm_weights_gamma"); - const T* pre_layernorm_weights_beta = input_tensors->getPtr("pre_layernorm_weights_beta"); + const int request_batch_size = input_tensors->at("attention_mask").shape[0]; + const int request_seq_len = input_tensors->at("attention_mask").shape[2]; + const int layer_id = input_tensors->getVal("layer_id"); + const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); + int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); T* attention_out = output_tensors->at("hidden_features").getPtr(); T* attention_input = input_tensors->at("input_query").getPtr(); @@ -67,40 +62,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten POP_RANGE; sync_check_cuda_error(); - const bool is_final = input_tensors->at("is_final_layer").getVal(); const int m = input_tensors->at("input_query").shape[0]; - PUSH_RANGE("attention buffer alloc"); - invokeGeneralLLaMALayerNorm(decoder_normed_input_, - attention_input, - pre_layernorm_weights_gamma, - pre_layernorm_weights_beta, - layernorm_eps_, - m, - hidden_units_, - stream_); - sync_check_cuda_error(); - POP_RANGE; - // if (l == 0) { - // T* out = (T*)malloc(sizeof(T) * h_token_num * hidden_units_); - // cudaMemcpy(out, decoder_normed_input_, sizeof(T) * h_token_num * hidden_units_, cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < h_token_num; ++b) { - // std::cout << "["; - // int i = 0; - // for (int h = 0; h < hidden_units_; ++h) { - // std::cout << out[b * hidden_units_ + h] << " "; - // ++i; - // if (i == 8) - // break; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } - sync_check_cuda_error(); - PUSH_RANGE("qkv_gemm"); cublas_wrapper_->Gemm(CUBLAS_OP_N, @@ -110,7 +73,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten hidden_units_, // k attention_weights->query_weight.kernel, 3 * hidden_units_, // n - decoder_normed_input_, + attention_input, hidden_units_, // k qkv_buf_, 3 * hidden_units_ /* n */); @@ -160,38 +123,38 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten 0, // int8_mode stream_); sync_check_cuda_error(); - if (layer_id == 0) { - // shape: [B, H, L, Dh] - T* q_buf = (T*)malloc(sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_); - T* k_buf = q_buf + request_batch_size * request_seq_len * hidden_units_; - T* v_buf = k_buf + request_batch_size * request_seq_len * hidden_units_; - cudaMemcpy(q_buf, - q_buf_2_, - sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_, - cudaMemcpyDeviceToHost); - sync_check_cuda_error(); - - for (int b = 0; b < request_batch_size; ++b) { - std::cout << "["; - for (int h = 0; h < head_num_; ++h) { - std::cout << "["; - for (int s = 0; s < request_seq_len; ++s) { - std::cout << "["; - for (int e = 0; e < 8; ++e) { - std::cout << k_buf[b * head_num_ * request_seq_len * size_per_head_ - + h * request_seq_len * size_per_head_ - + s * size_per_head_ - + e] - << " "; - } - std::cout << "]\n"; - } - 
std::cout << "]\n"; - } - std::cout << "]\n"; - } - std::cout << "\n"; - } + // if (layer_id == 0) { + // // shape: [B, H, L, Dh] + // T* q_buf = (T*)malloc(sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_); + // T* k_buf = q_buf + request_batch_size * request_seq_len * hidden_units_; + // T* v_buf = k_buf + request_batch_size * request_seq_len * hidden_units_; + // cudaMemcpy(q_buf, + // q_buf_2_, + // sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_, + // cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < request_batch_size; ++b) { + // std::cout << "["; + // for (int h = 0; h < head_num_; ++h) { + // std::cout << "["; + // for (int s = 0; s < request_seq_len; ++s) { + // std::cout << "["; + // for (int e = 0; e < 8; ++e) { + // std::cout << v_buf[b * head_num_ * request_seq_len * size_per_head_ + // + h * request_seq_len * size_per_head_ + // + s * size_per_head_ + // + e] + // << " "; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length // Use batch major @@ -212,160 +175,211 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // v_cache = (batch_size, num_heads, L, Dh) sync_check_cuda_error(); - // TODO: fmha kernels doesn't support different seq lengths of q and kv + // NOTE: qkv buffer shape (batch_size, num_heads,L or prompt_len + L, Dh) + + POP_RANGE; + if (attention_type == AttentionType::FUSED_MHA) { dispatcher_fp16->setup_causal_masked_fmha(request_seq_len, request_batch_size); dispatcher_fp16->run_causal_masked_fmha(qkv_buf_, cu_seqlens, qkv_buf_3_, true, stream_); } - // NOTE: qkv buffer shape (batch_size, num_heads,L or prompt_len + L, Dh) - - POP_RANGE; - if (is_final == false) { + else { const cudaDataType_t gemm_data_type = getCudaDataType(); const int attention_seq_len_1 = request_seq_len; // q length const int attention_seq_len_2 = request_seq_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); - if (attention_type != AttentionType::FUSED_MHA) { - if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { - PUSH_RANGE("Q*K batch gemm"); - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, - CUBLAS_OP_N, - attention_seq_len_2, // n - attention_seq_len_1, // m - size_per_head_, // k - 1.0f, - k_buf_2_, - gemm_data_type, - size_per_head_, // k - attention_seq_len_2 * size_per_head_, // n * k - q_buf_2_, - gemm_data_type, - size_per_head_, // k - attention_seq_len_1 * size_per_head_, // m * k - 0.0f, - qk_buf_float_, - CUDA_R_32F, - attention_seq_len_2, // n - attention_seq_len_2 * attention_seq_len_1, - request_batch_size * head_num_, // global batch size - CUDA_R_32F); - - sync_check_cuda_error(); - POP_RANGE; - - PUSH_RANGE("softmax"); - MaskedSoftmaxParam param; - param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) - param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = request_batch_size; - param.q_length = attention_seq_len_1; - param.k_length = attention_seq_len_2; - param.num_heads = head_num_; - param.qk_scale = qk_scale; - param.linear_bias_slopes = nullptr; - invokeMaskedSoftmax(param, stream_); - sync_check_cuda_error(); - POP_RANGE; - } - else { - PUSH_RANGE("Q*K batch gemm"); - 
cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, - CUBLAS_OP_N, - attention_seq_len_2, - attention_seq_len_1, - size_per_head_, - k_buf_2_, - size_per_head_, - attention_seq_len_2 * size_per_head_, - q_buf_2_, - size_per_head_, - attention_seq_len_1 * size_per_head_, - qk_buf_, - attention_seq_len_2, - attention_seq_len_2 * attention_seq_len_1, - request_batch_size * head_num_); - - POP_RANGE; - PUSH_RANGE("softmax"); - MaskedSoftmaxParam param; - param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.qk = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = request_batch_size; - param.q_length = attention_seq_len_1; - param.k_length = attention_seq_len_2; - param.num_heads = head_num_; - param.qk_scale = qk_scale; - param.linear_bias_slopes = nullptr; - invokeMaskedSoftmax(param, stream_); - sync_check_cuda_error(); - POP_RANGE; - } - - PUSH_RANGE("QK*V batch gemm"); - - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, + + // + // softmax(Q*K^T) + // + if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { + PUSH_RANGE("Q*K batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, + CUBLAS_OP_N, + attention_seq_len_2, // n + attention_seq_len_1, // m + size_per_head_, // k + 1.0f, + k_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_2 * size_per_head_, // n * k + q_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_1 * size_per_head_, // m * k + 0.0f, + qk_buf_float_, + CUDA_R_32F, + attention_seq_len_2, // n + attention_seq_len_2 * attention_seq_len_1, + request_batch_size * head_num_, // global batch size + CUDA_R_32F); + + sync_check_cuda_error(); + POP_RANGE; + + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = request_batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = nullptr; + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; + } + else { + PUSH_RANGE("Q*K batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, CUBLAS_OP_N, - size_per_head_, - attention_seq_len_1, attention_seq_len_2, - v_buf_2_, + attention_seq_len_1, + size_per_head_, + k_buf_2_, size_per_head_, attention_seq_len_2 * size_per_head_, - qk_buf_, - attention_seq_len_2, - attention_seq_len_1 * attention_seq_len_2, - qkv_buf_2_, + q_buf_2_, size_per_head_, attention_seq_len_1 * size_per_head_, + qk_buf_, + attention_seq_len_2, + attention_seq_len_2 * attention_seq_len_1, request_batch_size * head_num_); - // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) - if (padding_offset == nullptr) { - invokeTransposeQKV(qkv_buf_3_, - qkv_buf_2_, - request_batch_size, - attention_seq_len_1, - head_num_, - size_per_head_, - attention_weights->attention_output_weight.scale, - 0, // int8_mode - stream_); - sync_check_cuda_error(); - } - else { - invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, - qkv_buf_3_, - m, - request_batch_size, - attention_seq_len_1, - head_num_, - size_per_head_, - padding_offset, - attention_weights->attention_output_weight.scale, - 0, // int8_mode - stream_); - } POP_RANGE; + 
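            // Scaled masked softmax over the raw scores just produced: roughly
            // attention_score = softmax(qk_scale * Q*K^T with masked positions suppressed),
            // where qk_scale = 1 / sqrt(size_per_head_) as defined above. In this branch
            // invokeMaskedSoftmax reads and writes qk_buf_ in place before the QK*V
            // batched GEMM below.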
PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = request_batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = nullptr; + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; + } + + PUSH_RANGE("QK*V batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, + CUBLAS_OP_N, + size_per_head_, + attention_seq_len_1, + attention_seq_len_2, + v_buf_2_, + size_per_head_, + attention_seq_len_2 * size_per_head_, + qk_buf_, + attention_seq_len_2, + attention_seq_len_1 * attention_seq_len_2, + qkv_buf_2_, + size_per_head_, + attention_seq_len_1 * size_per_head_, + request_batch_size * head_num_); + + // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) + if (padding_offset == nullptr) { + invokeTransposeQKV(qkv_buf_3_, + qkv_buf_2_, + request_batch_size, + attention_seq_len_1, + head_num_, + size_per_head_, + attention_weights->attention_output_weight.scale, + 0, // int8_mode + stream_); + sync_check_cuda_error(); + } + else { + invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, + qkv_buf_3_, + m, + request_batch_size, + attention_seq_len_1, + head_num_, + size_per_head_, + padding_offset, + attention_weights->attention_output_weight.scale, + 0, // int8_mode + stream_); } - sync_check_cuda_error(); - - PUSH_RANGE("proj gemm"); - - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - hidden_units_, - m, - hidden_units_, - attention_weights->attention_output_weight.kernel, - hidden_units_, - qkv_buf_3_, - hidden_units_, - attention_out, - hidden_units_); POP_RANGE; } + sync_check_cuda_error(); + + // if (layer_id == 0) { + // // shape: [B, L, H] + // T* qkv_buf = (T*)malloc(sizeof(T) * request_batch_size * request_seq_len * hidden_units_); + // cudaMemcpy(qkv_buf, + // qkv_buf_3_, + // sizeof(T) * request_batch_size * request_seq_len * hidden_units_, + // cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < request_batch_size; ++b) { + // std::cout << "["; + // for (int s = 0; s < request_seq_len; ++s) { + // std::cout << "["; + // for (int h = 0; h < 8; ++h) { + // std::cout << qkv_buf[b * request_seq_len * hidden_units_ + // + s * hidden_units_ + // + h] + // << " "; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } + + PUSH_RANGE("proj gemm"); + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + hidden_units_, + m, + hidden_units_, + attention_weights->attention_output_weight.kernel, + hidden_units_, + qkv_buf_3_, + hidden_units_, + attention_out, + hidden_units_); + POP_RANGE; + // if (layer_id == 0) { + // // shape: [B, L, H] + // T* out = (T*)malloc(sizeof(T) * request_batch_size * request_seq_len * hidden_units_); + // cudaMemcpy(out, + // attention_out, + // sizeof(T) * request_batch_size * request_seq_len * hidden_units_, + // cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < request_batch_size; ++b) { + // std::cout << "["; + // for (int s = 0; s < request_seq_len; ++s) { + // std::cout << "["; + // for (int h = 0; h < 8; ++h) { + // std::cout << out[b * request_seq_len * hidden_units_ + // + s * hidden_units_ + // + h] + // << " "; + // } + // 
std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } if (is_free_buffer_after_forward_ == true) { freeBuffer(); @@ -501,8 +515,6 @@ void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, true); k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; v_buf_2_ = k_buf_2_ + batch_size * seq_len * hidden_units_; - decoder_normed_input_ = reinterpret_cast( - allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); // save memory usage when using fmha if (allocate_qk_buf) { @@ -537,7 +549,6 @@ void LLaMAContextAttentionLayer::freeBuffer() allocator_->free((void**)(&qk_buf_)); allocator_->free((void**)(&qkv_buf_2_)); allocator_->free((void**)(&qkv_buf_3_)); - allocator_->free((void**)(&decoder_normed_input_)); if (is_qk_buf_float_ == true) { allocator_->free((void**)(&qk_buf_float_)); diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index e52fdc0a7..e9086e278 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -37,7 +37,6 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { const size_t hidden_units_; const size_t rotary_embedding_dim_; const bool neox_rotary_style_; - static constexpr float layernorm_eps_ = 1e-6f; // fmha runner int sm_ = getSMVersion(); @@ -72,7 +71,6 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { size_t mixed_gemm_ws_bytes_ = 0; char* int8_gemm_workspace_ = nullptr; size_t int8_gemm_ws_bytes_ = 0; - T* decoder_normed_input_ = nullptr; public: LLaMAContextAttentionLayer(size_t max_batch_size, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index c74dd4663..3b4bb56c6 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -83,6 +83,9 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_ context_decoder_output_buf_ = (T*)(allocator_->reMalloc( context_decoder_output_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); + output_logits_ = (T*)(allocator_->reMalloc( + output_logits_, sizeof(T) * batch_size * vocab_size_ * hidden_units_, false)); + is_allocate_buffer_ = true; } @@ -110,6 +113,7 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); + allocator_->free((void**)(&output_logits_)); is_allocate_buffer_ = false; } @@ -379,29 +383,49 @@ void LLaMA::forward(std::unordered_map* output_ten &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - // invokeGeneralLLaMALayerNorm( - // context_decoder_input_buf_, - // embedding_input_buf_, - // llama_weights->post_decoder_layernorm.gamma, - // llama_weights->post_decoder_layernorm.beta, - // layernorm_eps_, - // batch_size * max_input_length, - // hidden_units_, - // stream_); - // sync_check_cuda_error(); - // - // cublas_wrapper_->Gemm(CUBLAS_OP_N, - // CUBLAS_OP_N, - // batch_size * max_input_length, - // vocab_size_, - // hidden_units_, - // context_decoder_output_buf_, - // hidden_units_, // n - // llama_weights->post_decoder_embedding.kernel, - // vocab_size_, // 
k - // /* FIXME */, - // hidden_units_ /* n */); - // sync_check_cuda_error(); + if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { + invokeGeneralLLaMALayerNorm(context_decoder_input_buf_, + context_decoder_output_buf_, + llama_weights->post_decoder_layernorm.gamma, + llama_weights->post_decoder_layernorm.beta, + layernorm_eps_, + batch_size * max_input_length, + hidden_units_, + stream_); + sync_check_cuda_error(); + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + vocab_size_, + batch_size * max_input_length, + hidden_units_, + llama_weights->post_decoder_embedding.kernel, + vocab_size_, + context_decoder_input_buf_, + hidden_units_, // n + output_logits_, + vocab_size_); + sync_check_cuda_error(); + + T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * vocab_size_); + cudaMemcpy(out, + output_logits_, + sizeof(T) * batch_size * max_input_length * vocab_size_, + cudaMemcpyDeviceToHost); + + for (int b = 0; b < batch_size; ++b) { + std::cout << "["; + for (int s = 0; s < max_input_length; ++s) { + std::cout << "["; + for (int v = 0; v < 8; ++v) { + std::cout << out[b * max_input_length * vocab_size_ + s * vocab_size_ + v] << " "; + } + std::cout << "]\n"; + } + std::cout << "]\n"; + } + std::cout << "\n"; + } + setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 386a09cd4..51d3d4dc0 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -82,6 +82,7 @@ class LLaMA: public BaseLayer { T* context_decoder_input_buf_; T* context_decoder_output_buf_; + T* output_logits_; // function pointer callback using callback_sig = void(std::unordered_map*, void*); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index c373c9d09..6a7857539 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -41,8 +41,8 @@ void LLaMAContextDecoder::initialize() false, 0); - ffn_layer_ = new GeluFfnLayer(0, // max_batch_size - 0, + ffn_layer_ = new SiluFfnLayer(0, // max_batch_size + 0, // max_seq_len head_num_, size_per_head_, 0, // expert_num @@ -52,8 +52,7 @@ void LLaMAContextDecoder::initialize() allocator_, is_free_buffer_after_forward_, false, - 0, - false // use_gated_activation = false + true // use_gated_activation = false ); } @@ -66,6 +65,8 @@ void LLaMAContextDecoder::allocateBuffer() template void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) { + decoder_normed_input_ = reinterpret_cast( + allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); self_attn_output_ = reinterpret_cast( allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); ffn_output_ = reinterpret_cast( @@ -83,6 +84,7 @@ template void LLaMAContextDecoder::freeBuffer() { if (is_allocate_buffer_ == true) { + allocator_->free((void**)(&decoder_normed_input_)); allocator_->free((void**)(&self_attn_output_)); allocator_->free((void**)(&ffn_output_)); allocator_->free((void**)(&decoder_layer_output_)); @@ -291,26 +293,43 @@ void LLaMAContextDecoder::forward(std::unordered_map* ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); } + 
invokeGeneralLLaMALayerNorm(decoder_normed_input_, + layer_input, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + h_token_num, + hidden_units_, + stream_); + sync_check_cuda_error(); + // if (l == 0) { + // T* out = (T*)malloc(sizeof(T) * h_token_num * hidden_units_); + // cudaMemcpy(out, decoder_normed_input_, sizeof(T) * h_token_num * hidden_units_, + // cudaMemcpyDeviceToHost); sync_check_cuda_error(); + // + // for (int b = 0; b < h_token_num; ++b) { + // std::cout << "["; + // int i = 0; + // for (int h = 0; h < hidden_units_; ++h) { + // std::cout << out[b * hidden_units_ + h] << " "; + // ++i; + // if (i == 8) + // break; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } + TensorMap self_attention_input_tensors{ - {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}, + {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, {"attention_mask", Tensor{MEMORY_GPU, data_type, {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, attention_mask}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, - {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, - {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, - {"pre_layernorm_weights_gamma", - Tensor{MEMORY_GPU, - data_type, - {(size_t)hidden_units_}, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma}}, - {"pre_layernorm_weights_beta", - Tensor{MEMORY_GPU, - data_type, - {(size_t)hidden_units_}, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta}}}; + {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; if (is_unpadded_mha) { self_attention_input_tensors.insert("padding_offset", @@ -332,68 +351,134 @@ void LLaMAContextDecoder::forward(std::unordered_map* self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); + // if (l == 0) { + // // shape: [B, L, H] + // T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); + // cudaMemcpy( + // out, self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, + // cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < batch_size; ++b) { + // std::cout << "["; + // for (int s = 0; s < seq_len; ++s) { + // std::cout << "["; + // for (int h = 0; h < 8; ++h) { + // std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } + + invokeGeneralLLaMAAddBiasResidualPreLayerNorm( + self_attn_output_, + layer_input, + self_attn_output_, + layer_input, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + h_token_num, + hidden_units_, + stream_); + sync_check_cuda_error(); + + // if (l == 0) { + // // shape: [B, L, H] + // T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); + // cudaMemcpy( + // out, layer_input, sizeof(T) * batch_size * seq_len * hidden_units_, cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < batch_size; ++b) { + // std::cout 
<< "["; + // for (int s = 0; s < seq_len; ++s) { + // std::cout << "["; + // for (int h = 0; h < 8; ++h) { + // std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } + + TensorMap ffn_input_tensors( + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}}); + TensorMap ffn_output_tensors( + {{"ffn_output", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); + ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); + + invokeAddBiasResidual(layer_output, + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + stream_); + + sync_check_cuda_error(); + +// if (l == 0) { +// // shape: [B, L, H] +// T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); +// cudaMemcpy(out, layer_output, sizeof(T) * batch_size * seq_len * hidden_units_, cudaMemcpyDeviceToHost); +// sync_check_cuda_error(); +// +// for (int b = 0; b < batch_size; ++b) { +// std::cout << "["; +// for (int s = 0; s < seq_len; ++s) { +// std::cout << "["; +// for (int h = 0; h < 8; ++h) { +// std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; +// } +// std::cout << "]\n"; +// } +// std::cout << "]\n"; +// } +// std::cout << "\n"; +// } + + if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 + && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_; + ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); + } - if (is_final == false) { - invokeGeneralAddBiasResidualPreLayerNorm( - self_attn_output_, - layer_input, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, - layernorm_eps_, - h_token_num, - hidden_units_, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - 0, - stream_); - - TensorMap ffn_input_tensors( - {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}}); - TensorMap ffn_output_tensors( - {{"ffn_output", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); - ffn_layer_->forward( - &ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); - - invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - h_token_num, - hidden_units_, - stream_); - - sync_check_cuda_error(); - - if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 - && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_; - ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); - } - - if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output, - decoder_layer_output_, - padding_offset_, - h_token_num, - head_num_ * size_per_head_, - stream_); - } + if ((l == num_layer_ - 1) && is_unpadded_mha) { + invokeRebuildPadding(decoder_output, + decoder_layer_output_, + padding_offset_, + h_token_num, + head_num_ * size_per_head_, + stream_); } } - // TODO(bhsueh) We 
could optimize this point by only computing the last token for the last layer - invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), - output_tensors->at("decoder_output").getPtr(), - input_tensors->at("input_lengths").getPtr(), - seq_len, - batch_size, - hidden_units_, - stream_); - sync_check_cuda_error(); +// if (pipeline_para_.rank_ == pipeline_para_.world_size_ -1) { +// // shape: [B, L, H] +// T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); +// cudaMemcpy(out, decoder_output, sizeof(T) * batch_size * seq_len * hidden_units_, cudaMemcpyDeviceToHost); +// sync_check_cuda_error(); +// +// for (int b = 0; b < batch_size; ++b) { +// std::cout << "["; +// for (int s = 0; s < seq_len; ++s) { +// std::cout << "["; +// for (int h = 0; h < 8; ++h) { +// std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; +// } +// std::cout << "]\n"; +// } +// std::cout << "]\n"; +// } +// std::cout << "\n"; +// } + if (is_free_buffer_after_forward_ == true) { freeBuffer(); } diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc index ff2ec11be..6f3a7721f 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc @@ -162,12 +162,12 @@ void LLaMADecoderLayerWeight::setWeightPtr() self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; self_attention_weights.attention_output_weight.bias = weights_ptr[5]; - ffn_weights.intermediate_weight.kernel = weights_ptr[6]; - ffn_weights.intermediate_weight.bias = weights_ptr[7]; - ffn_weights.output_weight.kernel = weights_ptr[8]; - ffn_weights.output_weight.bias = weights_ptr[9]; - ffn_weights.gating_weight.kernel = weights_ptr[10]; - ffn_weights.gating_weight.bias = weights_ptr[11]; + ffn_weights.intermediate_weight.kernel = weights_ptr[6]; + ffn_weights.intermediate_weight.bias = weights_ptr[7]; + ffn_weights.output_weight.kernel = weights_ptr[8]; + ffn_weights.output_weight.bias = weights_ptr[9]; + ffn_weights.intermediate_weight2.kernel = weights_ptr[10]; + ffn_weights.intermediate_weight2.bias = weights_ptr[11]; post_attention_layernorm_weights.beta = weights_ptr[12]; post_attention_layernorm_weights.gamma = weights_ptr[13]; From 4434e65e742474d91acba0b00d73b82822497f66 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 16 Sep 2023 20:27:32 +0000 Subject: [PATCH 15/55] remove debugging code print --- src/fastertransformer/models/llama/LLaMA.cc | 36 ++++++++++----------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 3b4bb56c6..6285b804b 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -406,24 +406,24 @@ void LLaMA::forward(std::unordered_map* output_ten vocab_size_); sync_check_cuda_error(); - T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * vocab_size_); - cudaMemcpy(out, - output_logits_, - sizeof(T) * batch_size * max_input_length * vocab_size_, - cudaMemcpyDeviceToHost); - - for (int b = 0; b < batch_size; ++b) { - std::cout << "["; - for (int s = 0; s < max_input_length; ++s) { - std::cout << "["; - for (int v = 0; v < 8; ++v) { - std::cout << out[b * max_input_length * vocab_size_ + s * vocab_size_ + v] << " "; - } - std::cout << "]\n"; - } - std::cout << "]\n"; - } - std::cout << "\n"; +// T* out = 
(T*)malloc(sizeof(T) * batch_size * max_input_length * vocab_size_); +// cudaMemcpy(out, +// output_logits_, +// sizeof(T) * batch_size * max_input_length * vocab_size_, +// cudaMemcpyDeviceToHost); +// +// for (int b = 0; b < batch_size; ++b) { +// std::cout << "["; +// for (int s = 0; s < max_input_length; ++s) { +// std::cout << "["; +// for (int v = 0; v < 8; ++v) { +// std::cout << out[b * max_input_length * vocab_size_ + s * vocab_size_ + v] << " "; +// } +// std::cout << "]\n"; +// } +// std::cout << "]\n"; +// } +// std::cout << "\n"; } From 95a7efe0b69a872f671b2dd9fd7d4453feae7e5f Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 16 Sep 2023 20:55:50 +0000 Subject: [PATCH 16/55] remove debugging code --- examples/cpp/llama/llama_example.cc | 49 +++--- .../LLaMAContextAttentionLayer.cc | 103 ------------- src/fastertransformer/models/llama/LLaMA.cc | 139 ++---------------- src/fastertransformer/models/llama/LLaMA.h | 5 - .../models/llama/LLaMAContextDecoder.cc | 100 ------------- 5 files changed, 44 insertions(+), 352 deletions(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 43f55c4b7..a558bbf65 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -213,10 +213,8 @@ void llama_example(const INIReader reader) &prop, attention_type); - int* d_output_ids; - int* d_sequence_lengths; - deviceMalloc(&d_output_ids, request_batch_size * total_output_len, false); - deviceMalloc(&d_sequence_lengths, request_batch_size, false); + T* d_output_logits; + deviceMalloc(&d_output_logits, request_batch_size * total_output_len * vocab_size, false); std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", @@ -226,18 +224,14 @@ void llama_example(const INIReader reader) Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}, {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{1}, &min_length}}, {"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}, - {"max_cache_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &max_cache_seq_len}} - }; + {"max_cache_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &max_cache_seq_len}}}; std::unordered_map output_tensors = std::unordered_map{ - {"output_ids", + {"output_logits", Tensor{MEMORY_GPU, - TYPE_INT32, - std::vector{request_batch_size, 1, (size_t)total_output_len}, - d_output_ids}}, - {"sequence_length", - Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size}, d_sequence_lengths}}, - }; + TYPE_FP16, + std::vector{request_batch_size, (size_t)total_output_len, vocab_size}, + d_output_logits}}}; print_mem_usage(); @@ -259,6 +253,25 @@ void llama_example(const INIReader reader) POP_RANGE; ft_nvtx::resetScope(); +// if (rank == world_size-1) { +// T* out = (T*)malloc(sizeof(T) * request_batch_size * total_output_len * vocab_size); +// cudaMemcpy( +// out, d_output_logits, sizeof(T) * request_batch_size * total_output_len * vocab_size, cudaMemcpyDeviceToHost); +// for (int b = 0; b < request_batch_size; ++b) { +// std::cout << "["; +// for (int s = 0; s < total_output_len; ++s) { +// std::cout << "["; +// for (int v = vocab_size-8; v < vocab_size; ++v) { +// std::cout << out[b * total_output_len * vocab_size + s * vocab_size + v] << " "; +// } +// std::cout << "]\n"; +// } +// std::cout << "]\n"; +// } +// std::cout << "\n"; +// } + + /* if (rank == 0) { std::string fName = "out"; @@ -269,7 +282,7 @@ void 
llama_example(const INIReader reader) else { size_t outCount = total_output_len * request_batch_size; int* hBuf = new int[outCount]; - cudaD2Hcpy(hBuf, d_output_ids, outCount); + cudaD2Hcpy(hBuf, d_output_logits, outCount); { std::cout << "Writing " << outCount << " elements\n"; @@ -295,6 +308,7 @@ void llama_example(const INIReader reader) delete[] hBuf; } } + */ // test time struct timeval start, end; @@ -339,11 +353,8 @@ void llama_example(const INIReader reader) if (d_input_lengths != nullptr) { cudaFree(d_input_lengths); } - if (d_output_ids != nullptr) { - deviceFree(d_output_ids); - } - if (d_sequence_lengths != nullptr) { - deviceFree(d_sequence_lengths); + if (d_output_logits != nullptr) { + deviceFree(d_output_logits); } return; diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 70e638150..10e39fd39 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -78,26 +78,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, 3 * hidden_units_ /* n */); sync_check_cuda_error(); - // if (layer_id == 0) { - // T* qkv_buf = (T*)malloc(sizeof(T) * m * 3 * hidden_units_); - // cudaMemcpy(qkv_buf, qkv_buf_, sizeof(T) * m * 3 * hidden_units_, cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < request_batch_size; ++b) { - // std::cout << "["; - // for (int s = 0; s < request_seq_len; ++s) { - // std::cout << "["; - // for (int h = 0; h < 8; ++h) { - // std::cout << qkv_buf[((b * request_seq_len) + s) * 3 * hidden_units_ + h + 2 * hidden_units_] - // << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } - // IDEA: append prefix prompt key value here PrefixPromptBatchWeightsParam param{nullptr, nullptr, 0, (size_t)layer_id * 2 * head_num_ * size_per_head_}; @@ -123,38 +103,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten 0, // int8_mode stream_); sync_check_cuda_error(); - // if (layer_id == 0) { - // // shape: [B, H, L, Dh] - // T* q_buf = (T*)malloc(sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_); - // T* k_buf = q_buf + request_batch_size * request_seq_len * hidden_units_; - // T* v_buf = k_buf + request_batch_size * request_seq_len * hidden_units_; - // cudaMemcpy(q_buf, - // q_buf_2_, - // sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_, - // cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < request_batch_size; ++b) { - // std::cout << "["; - // for (int h = 0; h < head_num_; ++h) { - // std::cout << "["; - // for (int s = 0; s < request_seq_len; ++s) { - // std::cout << "["; - // for (int e = 0; e < 8; ++e) { - // std::cout << v_buf[b * head_num_ * request_seq_len * size_per_head_ - // + h * request_seq_len * size_per_head_ - // + s * size_per_head_ - // + e] - // << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length // Use batch major @@ -316,32 +264,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } sync_check_cuda_error(); - // if (layer_id == 0) { - // // shape: [B, L, H] - // T* qkv_buf = (T*)malloc(sizeof(T) * request_batch_size * 
request_seq_len * hidden_units_); - // cudaMemcpy(qkv_buf, - // qkv_buf_3_, - // sizeof(T) * request_batch_size * request_seq_len * hidden_units_, - // cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < request_batch_size; ++b) { - // std::cout << "["; - // for (int s = 0; s < request_seq_len; ++s) { - // std::cout << "["; - // for (int h = 0; h < 8; ++h) { - // std::cout << qkv_buf[b * request_seq_len * hidden_units_ - // + s * hidden_units_ - // + h] - // << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } - PUSH_RANGE("proj gemm"); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, @@ -355,31 +277,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten attention_out, hidden_units_); POP_RANGE; - // if (layer_id == 0) { - // // shape: [B, L, H] - // T* out = (T*)malloc(sizeof(T) * request_batch_size * request_seq_len * hidden_units_); - // cudaMemcpy(out, - // attention_out, - // sizeof(T) * request_batch_size * request_seq_len * hidden_units_, - // cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < request_batch_size; ++b) { - // std::cout << "["; - // for (int s = 0; s < request_seq_len; ++s) { - // std::cout << "["; - // for (int h = 0; h < 8; ++h) { - // std::cout << out[b * request_seq_len * hidden_units_ - // + s * hidden_units_ - // + h] - // << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } if (is_free_buffer_after_forward_ == true) { freeBuffer(); diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 6285b804b..02d46b5b9 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -83,9 +83,6 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_ context_decoder_output_buf_ = (T*)(allocator_->reMalloc( context_decoder_output_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); - output_logits_ = (T*)(allocator_->reMalloc( - output_logits_, sizeof(T) * batch_size * vocab_size_ * hidden_units_, false)); - is_allocate_buffer_ = true; } @@ -113,7 +110,6 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); - allocator_->free((void**)(&output_logits_)); is_allocate_buffer_ = false; } @@ -245,27 +241,15 @@ void LLaMA::forward(std::unordered_map* output_ten // max_cache_seq_len [batch_size] on cpu // output_tensors: - // output_ids [batch_size, 1, max_output_seq_len] - // sequence_length [batch_size] - - // Step is from max_input_length ~ max_output_seq_len, - // When step = k, we put output ids and caches at step k, and the sequence_length would be k - 1 before - // complete this step. - // When there is no input_ids, put the start token at step 0 of output_ids_buf_. 
After forward, only copy - // the step 1 ~ max_output_seq_len of output_ids_buf_ to output_tensors->at(0).data + // output_logits [batch_size, max_output_seq_len, vocab_size] FT_CHECK_WITH_INFO(input_tensors->size() >= 3, "input_tensors->size() >= 3"); - FT_CHECK_WITH_INFO(output_tensors->size() >= 2, "output_tensors->size() >= 2"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); FT_CHECK(input_tensors->find("output_seq_len") != input_tensors->end() && input_tensors->at("output_seq_len").shape.size() == 1); - FT_CHECK(output_tensors->at("output_ids").shape.size() == 3); - FT_CHECK(output_tensors->at("sequence_length").shape.size() == 1); - FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape[0] == output_tensors->at("output_ids").shape[0], - "input_tensors->at(\"input_ids\").shape[0] == output_tensors->at(\"output_ids\").shape[0]"); - const size_t batch_size = output_tensors->at("output_ids").shape[0]; + const size_t batch_size = input_tensors->at("input_ids").shape[0]; // NOTE: Prefix Prompt PreProcessing // get prefix_prompt_weight for each batch --> shape [batch, 1] @@ -332,27 +316,6 @@ void LLaMA::forward(std::unordered_map* output_ten hidden_units_, stream_); sync_check_cuda_error(); - // if (pipeline_para_.rank_ == 0) { - // T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * hidden_units_); - // cudaMemcpy(out, - // context_decoder_input_buf_, - // sizeof(T) * batch_size * max_input_length * hidden_units_, - // cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < batch_size; ++b) { - // std::cout << "["; - // for (int s = 0; s < max_input_length; ++s) { - // std::cout << "["; - // for (int h = 0; h < 8; ++h) { - // std::cout << out[b * batch_size * hidden_units_ + s * hidden_units_ + h] << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } invokeBuildDecoderAttentionMask( input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, max_input_length, 0, stream_); @@ -384,6 +347,7 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { + T* output_logits = output_tensors->at("output_logits").getPtr(); invokeGeneralLLaMALayerNorm(context_decoder_input_buf_, context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, @@ -394,41 +358,20 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - vocab_size_, - batch_size * max_input_length, - hidden_units_, - llama_weights->post_decoder_embedding.kernel, - vocab_size_, - context_decoder_input_buf_, - hidden_units_, // n - output_logits_, - vocab_size_); + CUBLAS_OP_N, + vocab_size_, + batch_size * max_input_length, + hidden_units_, + llama_weights->post_decoder_embedding.kernel, + vocab_size_, + context_decoder_input_buf_, + hidden_units_, // n + output_logits, + vocab_size_); sync_check_cuda_error(); - -// T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * vocab_size_); -// cudaMemcpy(out, -// output_logits_, -// sizeof(T) * batch_size * max_input_length * vocab_size_, -// cudaMemcpyDeviceToHost); -// -// for (int b = 0; b < batch_size; ++b) { -// std::cout << "["; -// for (int s = 0; s < max_input_length; ++s) { -// std::cout << "["; -// for (int v = 0; v < 8; ++v) { -// std::cout << out[b * max_input_length * vocab_size_ + s * vocab_size_ + v] << " "; -// } -// 
std::cout << "]\n"; -// } -// std::cout << "]\n"; -// } -// std::cout << "\n"; } - - setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); - sendTensorsToFirstPipelineNode(output_tensors, input_tensors); + // sendTensorsToFirstPipelineNode(output_tensors, input_tensors); } template @@ -467,60 +410,6 @@ void LLaMA::sendTensorsToFirstPipelineNode(std::unordered_map -void LLaMA::setOutputTensors(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const size_t max_input_length, - const size_t max_output_seq_len) -{ - FT_LOG_DEBUG(__PRETTY_FUNCTION__); - if (pipeline_para_.rank_ != pipeline_para_.world_size_ - 1) { - return; - } - - const size_t batch_size = output_tensors->at("output_ids").shape[0]; - uint* sequence_lengths = output_tensors->at("sequence_length").getPtr(); - - if (input_tensors->at("input_ids").shape[1] == 0) { - invokeCudaD2DcpyConvert( - sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); - // TODO: D2D sequence_lenghts - // For sampling, only copy the results to output_tensor - invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), - output_ids_buf_ + batch_size, - max_output_seq_len - 1, - batch_size, - 1, - stream_); - } - else { - - // For sampling, it is equivalent to all parent ids are 0. - gatherTreeParam param; - param.beams = transposed_output_ids_buf_; - param.max_sequence_lengths = sequence_lengths_; - // add sequence_length 1 here because the sequence_length of time step t is t - 1 - param.max_sequence_length_final_step = 1; - param.max_time = max_output_seq_len; - param.batch_size = batch_size; - param.beam_width = 1; - param.step_ids = output_ids_buf_; - param.parent_ids = nullptr; - param.end_tokens = end_ids_buf_; - param.max_input_length = max_input_length; - param.prefix_soft_prompt_lengths = nullptr; - param.input_lengths = tiled_input_lengths_buf_; - param.max_prefix_soft_prompt_length = 0; - param.max_input_without_prompt_length = max_input_length; - param.stream = stream_; - param.output_ids = output_tensors->at("output_ids").getPtr(); - invokeGatherTree(param); - invokeCudaD2DcpyConvert( - sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); - sync_check_cuda_error(); - } -} - template size_t LLaMA::getPipelineParallelRank() { diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 51d3d4dc0..3b7995927 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -82,17 +82,12 @@ class LLaMA: public BaseLayer { T* context_decoder_input_buf_; T* context_decoder_output_buf_; - T* output_logits_; // function pointer callback using callback_sig = void(std::unordered_map*, void*); callback_sig* token_generated_cb_ = nullptr; void* token_generated_ctx_ = nullptr; - void setOutputTensors(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const size_t max_input_length, - const size_t max_seq_len); void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 6a7857539..49af917de 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -302,24 +302,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* hidden_units_, 
stream_); sync_check_cuda_error(); - // if (l == 0) { - // T* out = (T*)malloc(sizeof(T) * h_token_num * hidden_units_); - // cudaMemcpy(out, decoder_normed_input_, sizeof(T) * h_token_num * hidden_units_, - // cudaMemcpyDeviceToHost); sync_check_cuda_error(); - // - // for (int b = 0; b < h_token_num; ++b) { - // std::cout << "["; - // int i = 0; - // for (int h = 0; h < hidden_units_; ++h) { - // std::cout << out[b * hidden_units_ + h] << " "; - // ++i; - // if (i == 8) - // break; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, @@ -351,27 +333,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); - // if (l == 0) { - // // shape: [B, L, H] - // T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); - // cudaMemcpy( - // out, self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, - // cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < batch_size; ++b) { - // std::cout << "["; - // for (int s = 0; s < seq_len; ++s) { - // std::cout << "["; - // for (int h = 0; h < 8; ++h) { - // std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, @@ -387,27 +348,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* stream_); sync_check_cuda_error(); - // if (l == 0) { - // // shape: [B, L, H] - // T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); - // cudaMemcpy( - // out, layer_input, sizeof(T) * batch_size * seq_len * hidden_units_, cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < batch_size; ++b) { - // std::cout << "["; - // for (int s = 0; s < seq_len; ++s) { - // std::cout << "["; - // for (int h = 0; h < 8; ++h) { - // std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } - TensorMap ffn_input_tensors( {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}}); TensorMap ffn_output_tensors( @@ -423,26 +363,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); -// if (l == 0) { -// // shape: [B, L, H] -// T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); -// cudaMemcpy(out, layer_output, sizeof(T) * batch_size * seq_len * hidden_units_, cudaMemcpyDeviceToHost); -// sync_check_cuda_error(); -// -// for (int b = 0; b < batch_size; ++b) { -// std::cout << "["; -// for (int s = 0; s < seq_len; ++s) { -// std::cout << "["; -// for (int h = 0; h < 8; ++h) { -// std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; -// } -// std::cout << "]\n"; -// } -// std::cout << "]\n"; -// } -// std::cout << "\n"; -// } - if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; @@ -459,26 +379,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* } } -// if (pipeline_para_.rank_ == pipeline_para_.world_size_ -1) { -// // shape: 
[B, L, H] -// T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); -// cudaMemcpy(out, decoder_output, sizeof(T) * batch_size * seq_len * hidden_units_, cudaMemcpyDeviceToHost); -// sync_check_cuda_error(); -// -// for (int b = 0; b < batch_size; ++b) { -// std::cout << "["; -// for (int s = 0; s < seq_len; ++s) { -// std::cout << "["; -// for (int h = 0; h < 8; ++h) { -// std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; -// } -// std::cout << "]\n"; -// } -// std::cout << "]\n"; -// } -// std::cout << "\n"; -// } - if (is_free_buffer_after_forward_ == true) { freeBuffer(); } From 0a0015d61fcf9fed413a491f870b7e2b1e88eba5 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 18 Sep 2023 04:05:24 +0000 Subject: [PATCH 17/55] LLaMA Constructor fix --- src/fastertransformer/th_op/CMakeLists.txt | 3 + src/fastertransformer/th_op/llama/LLaMA.h | 97 +++++++++------------- 2 files changed, 44 insertions(+), 56 deletions(-) diff --git a/src/fastertransformer/th_op/CMakeLists.txt b/src/fastertransformer/th_op/CMakeLists.txt index b9f2b9151..4e8d82d30 100644 --- a/src/fastertransformer/th_op/CMakeLists.txt +++ b/src/fastertransformer/th_op/CMakeLists.txt @@ -32,6 +32,7 @@ add_subdirectory(t5) add_subdirectory(bart) add_subdirectory(bert) add_subdirectory(deberta) +add_subdirectory(llama) add_library(th_transformer SHARED $ @@ -49,6 +50,7 @@ add_library(th_transformer SHARED $ $ $ + $ ) target_link_libraries(th_transformer PUBLIC "${TORCH_LIBRARIES}" th_bart @@ -66,6 +68,7 @@ target_link_libraries(th_transformer PUBLIC "${TORCH_LIBRARIES}" th_t5 th_utils th_vit + th_llama ) if(ENABLE_FP8) diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 1aac8a7d7..9a5efa3d0 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -50,18 +50,18 @@ template class FTLLaMA: public IFLLaMA { public: FTLLaMA(const size_t head_num, - const size_t size_per_head, - const size_t inter_size, - const size_t layer_num, - const size_t vocab_size, - const size_t rotary_embedding_dim, - const int start_id, - const int end_id, - const int64_t tensor_para_size, - const int64_t pipeline_para_size, - const size_t max_seq_len, - const bool use_gptj_residual, - const vector weights): + const size_t size_per_head, + const size_t inter_size, + const size_t layer_num, + const size_t vocab_size, + const size_t rotary_embedding_dim, + const int start_id, + const int end_id, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const size_t max_seq_len, + const bool use_gptj_residual, + const vector weights): head_num_(head_num), size_per_head_(size_per_head), inter_size_(inter_size), @@ -114,7 +114,7 @@ class FTLLaMA: public IFLLaMA { llama_weights_.post_decoder_layernorm.beta = get_ptr(weights_[12 * layer_num_ + 2]); llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[12 * layer_num_ + 3]); - llama_weights_.setMaxSeqLen(max_seq_len); + //llama_weights_.setMaxSeqLen(max_seq_len); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); } @@ -172,35 +172,20 @@ class FTLLaMA: public IFLLaMA { false, // with_relative_position_bias true); // causal_mask - ft::LLaMA llama = ft::LLaMA(head_num_, - size_per_head_, - inter_size_, - layer_num_, - vocab_size_, - rotary_embedding_dim_, - start_id_, - end_id_, - end_id_ + 1, // p/prompt tuning virtual token start id - ft::PromptLearningType::no_prompt, - use_gptj_residual_, - 0.0f, // beam_search_diversity_rate, - 1, // top_k, - 
0.0, // top_p, - 0, // random_seed, - 1.0f, // temperature, - 1.0f, // len_penalty, - 1.0f, // repetition_penalty, - tensor_para_, - pipeline_para_, - stream, - &cublas_wrapper, - &allocator, - false, // is_free_buffer_after_forward - &prop_, // cuda_device_prop - attention_type, // attention_type - nullptr, // custom_all_reduce_comm - 0); // enable_custom_all_reduce - + ft::LLaMA llama = ft::LLaMA(head_num_, + size_per_head_, + inter_size_, + layer_num_, + vocab_size_, + rotary_embedding_dim_, + 0, // random_seed, + stream, + &cublas_wrapper, + &allocator, + false, // is_free_buffer_after_forward + &prop_, // cuda_device_prop + attention_type // attention_type + ); std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ @@ -297,7 +282,7 @@ class FTLLaMA: public IFLLaMA { std::mutex* cublas_wrapper_mutex_; ft::cublasAlgoMap* cublas_algo_map_; struct cudaDeviceProp prop_; - ft::LLaMAWeight llama_weights_; + ft::LLaMAWeight llama_weights_; ft::NcclParam tensor_para_; ft::NcclParam pipeline_para_; @@ -309,18 +294,18 @@ class FTLLaMA: public IFLLaMA { class LLaMA: public th::jit::CustomClassHolder { public: LLaMA(const int64_t head_num, - const int64_t size_per_head, - const int64_t inter_size, - const int64_t layer_num, - const int64_t vocab_size, - const int64_t rotary_embedding_dim, - const int64_t start_id, - const int64_t end_id, - const int64_t tensor_para_size, - const int64_t pipeline_para_size, - const int64_t max_seq_len, - const bool use_gptj_residual, - const vector weights); + const int64_t size_per_head, + const int64_t inter_size, + const int64_t layer_num, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t start_id, + const int64_t end_id, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const int64_t max_seq_len, + const bool use_gptj_residual, + const vector weights); ~LLaMA(); @@ -339,7 +324,7 @@ class LLaMA: public th::jit::CustomClassHolder { private: const at::ScalarType st_; - IFLLaMA* ftllama; + IFLLaMA* ftllama; std::vector weights; }; From 6ed374791f932f61ee2072d1d6cc623f2b066810 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 18 Sep 2023 09:11:32 +0000 Subject: [PATCH 18/55] llama-opt --- examples/cpp/llama/backup.csv | 32 ++++++ examples/cpp/llama/llama_config.ini | 3 +- examples/cpp/llama/llama_example.cc | 107 ++++++++---------- examples/cpp/llama/start_ids.csv | 36 +++++- .../LLaMAContextAttentionLayer.cc | 62 +++------- .../LLaMAContextAttentionLayer.h | 22 +--- src/fastertransformer/models/llama/LLaMA.cc | 69 +++-------- src/fastertransformer/models/llama/LLaMA.h | 95 ++++++---------- .../models/llama/LLaMAContextDecoder.cc | 21 +--- .../models/llama/LLaMAContextDecoder.h | 9 +- 10 files changed, 187 insertions(+), 269 deletions(-) create mode 100644 examples/cpp/llama/backup.csv diff --git a/examples/cpp/llama/backup.csv b/examples/cpp/llama/backup.csv new file mode 100644 index 000000000..eb28ed345 --- /dev/null +++ b/examples/cpp/llama/backup.csv @@ -0,0 +1,32 @@ +1, 14542, 3262, 8112, 29901, 7803, 1757, 526, 13407, 297, 263, 13569, 29889, 2688 +1, 7392, 1026, 29901, 319, 11379, 15028, 297, 263, 17948, 8693, 29889, 450, 11379 +1, 5057, 12500, 29901, 319, 8023, 338, 2734, 1623, 263, 5702, 29889, 450, 8023 +1, 17984, 18558, 29901, 1334, 1074, 263, 767, 13407, 297, 263, 5716, 29889, 450, 767 +1, 19509, 263, 1766, 25206, 29901, 11647, 526, 2734, 373, 278, 11952, 29889, 319, 767 +1, 7412, 292, 11565, 29901, 11647, 526, 16246, 5742, 6131, 
13587, 3949, 18464, 29889, 11647 +1, 7412, 292, 10311, 26082, 29901, 319, 767, 338, 16246, 2768, 263, 5716, 29889, 940 +1, 323, 4524, 29901, 319, 767, 322, 6114, 526, 16246, 373, 263, 7408, 4208, 29889, 2688 +1, 8565, 375, 3183, 29901, 319, 767, 338, 13407, 701, 322, 8026, 263, 7679, 29889, 11647 +1, 12878, 292, 278, 11203, 29901, 319, 6114, 17042, 2039, 714, 11480, 278, 17455, 29889, 7803, 2319, 26361 +1, 5057, 12500, 29901, 319, 6114, 338, 22049, 3412, 263, 5702, 29889, 2296 +1, 8565, 11203, 29901, 319, 2919, 19174, 338, 22229, 2820, 263, 1746, 29889, 11647 +1, 8360, 5367, 29901, 319, 6114, 338, 409, 630, 472, 263, 1591, 29889, 2296 +1, 476, 484, 14067, 29901, 319, 767, 17905, 1379, 373, 263, 17132, 29889, 450, 767 +1, 7412, 292, 378, 25496, 29901, 319, 767, 338, 16246, 5742, 1023, 28987, 29889, 940 +1, 1706, 262, 1076, 29901, 319, 767, 338, 16246, 373, 385, 15058, 4768, 446, 29889, 940 +1, 390, 5086, 11308, 29901, 319, 767, 338, 1153, 9292, 11308, 297, 263, 29413, 29889, 940 +1, 7412, 292, 11210, 336, 29901, 319, 767, 715, 16926, 263, 21387, 964, 670, 11210, 29889, 940 +1, 7412, 292, 11210, 336, 29901, 319, 4123, 767, 269, 1169, 373, 263, 6592, 29889, 450, 767 +1, 28551, 292, 29901, 11647, 526, 13407, 373, 263, 17306, 310, 15007, 29889, 11647 +1, 8481, 24613, 1847, 29901, 319, 767, 338, 13407, 373, 263, 19587, 11952, 29889, 940 +1, 7412, 292, 1248, 29877, 29901, 11647, 526, 2381, 25217, 297, 278, 4094, 29889, 7803, 5866 +1, 5057, 12500, 29901, 319, 767, 15028, 297, 278, 7256, 310, 263, 10728, 1974, 29889, 29445, 6289 +1, 5057, 12500, 29901, 319, 767, 338, 4318, 2734, 1623, 263, 5702, 29889, 940 +1, 6781, 8522, 29901, 11647, 526, 13407, 373, 263, 1746, 9963, 29889, 450, 1757 +1, 8565, 11203, 29901, 319, 6114, 338, 8743, 411, 263, 11203, 29889, 450, 11203 +1, 3925, 25217, 29901, 319, 2381, 25217, 11565, 338, 4318, 297, 263, 5716, 29889, 7567 +1, 6163, 23131, 292, 29901, 12753, 2305, 748, 23131, 292, 1623, 263, 10952, 29889, 2688 +1, 399, 336, 3262, 22981, 29901, 1334, 1074, 263, 3800, 373, 18187, 29889, 319, 2022 +1, 28551, 292, 29901, 319, 2022, 338, 14993, 292, 1623, 263, 17306, 310, 15007, 29889, 2688 +1, 399, 1161, 292, 3700, 29901, 319, 6114, 338, 13407, 297, 263, 5716, 9963, 29889, 2296 +1, 2522, 11495, 1933, 292, 29901, 319, 767, 338, 1090, 4094, 297, 263, 11565, 29889, 940 diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 1e92695e5..931b24e5d 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -6,8 +6,7 @@ pipeline_para_size=4 [request] -beam_width=1 # beam width for beam search -request_batch_size=4 # determine by the request +request_batch_size=32 [llama_33B] head_num=52 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index a558bbf65..d2c8dcf51 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -189,13 +189,30 @@ void llama_example(const INIReader reader) mpi::bcast(&random_seed, 1, mpi::MPI_TYPE_UNSIGNED_LONG_LONG, 0, mpi::COMM_WORLD); } - AttentionType attention_type = getAttentionType(size_per_head, - getSMVersion(), - true, // remove_padding - 0, // llama supports any-seq-length fmha - true, // is_fuse - false, // with_relative_position_bias - true); // causal_mask + AttentionType attention_type = + getAttentionType(size_per_head, + getSMVersion(), + !((std::getenv("SHONG_PADDING") != nullptr) + && (std::string(std::getenv("SHONG_PADDING")) == "ON")), //true, // remove_padding + 0, // llama supports 
any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + + switch (attention_type) { + case AttentionType::UNFUSED_MHA: + std::cout << "UNFUSED_MHA\n"; + break; + case AttentionType::UNFUSED_PADDED_MHA: + std::cout << "UNFUSED_PADDED_MHA\n"; + break; + case AttentionType::FUSED_MHA: + std::cout << "FUSED_MHA\n"; + break; + case AttentionType::FUSED_PADDED_MHA: + std::cout << "FUSED_PADDED_MHA\n"; + break; + } LLaMA llama = LLaMA(head_num, size_per_head, @@ -239,7 +256,6 @@ void llama_example(const INIReader reader) cudaDeviceSynchronize(); mpi::barrier(); - cudaProfilerStart(); // warm up ite = 1; ft_nvtx::setScope("warmup_time"); @@ -253,71 +269,39 @@ void llama_example(const INIReader reader) POP_RANGE; ft_nvtx::resetScope(); -// if (rank == world_size-1) { -// T* out = (T*)malloc(sizeof(T) * request_batch_size * total_output_len * vocab_size); -// cudaMemcpy( -// out, d_output_logits, sizeof(T) * request_batch_size * total_output_len * vocab_size, cudaMemcpyDeviceToHost); -// for (int b = 0; b < request_batch_size; ++b) { -// std::cout << "["; -// for (int s = 0; s < total_output_len; ++s) { -// std::cout << "["; -// for (int v = vocab_size-8; v < vocab_size; ++v) { -// std::cout << out[b * total_output_len * vocab_size + s * vocab_size + v] << " "; -// } -// std::cout << "]\n"; -// } -// std::cout << "]\n"; -// } -// std::cout << "\n"; -// } - - /* - if (rank == 0) { - - std::string fName = "out"; - auto outFile = std::ofstream(fName, std::ios::out); - if (!outFile.is_open()) { - printf("[WARNING] Cannot write results into output file %s \n", fName.c_str()); - } - else { - size_t outCount = total_output_len * request_batch_size; - int* hBuf = new int[outCount]; - cudaD2Hcpy(hBuf, d_output_logits, outCount); - - { - std::cout << "Writing " << outCount << " elements\n"; - int zeroCount = 0; - for (size_t i = 0; i < outCount; i++) { - if (hBuf[i] == int(0)) { - zeroCount++; - } - outFile << hBuf[i] << " "; - if ((i + 1) % (total_output_len) == 0) { - outFile << std::endl; - } - - if (i < 10) { - printf("%5d ", hBuf[i]); - } - if ((i + 1) % (total_output_len) == 0 && i < 10) { - std::cout << std::endl; - } + if (rank == world_size - 1) { + T* out = (T*)malloc(sizeof(T) * request_batch_size * total_output_len * vocab_size); + cudaMemcpy(out, + d_output_logits, + sizeof(T) * request_batch_size * total_output_len * vocab_size, + cudaMemcpyDeviceToHost); + for (int b = 0; b < request_batch_size; ++b) { + std::cout << "["; + for (int s = 0; s < total_output_len; ++s) { + std::cout << "["; + for (int v = vocab_size - 8; v < vocab_size; ++v) { + std::cout << out[b * total_output_len * vocab_size + s * vocab_size + v] << " "; } - std::cout << std::endl << "zeroCount = " << zeroCount << std::endl; + std::cout << "]\n"; } - delete[] hBuf; + std::cout << "]\n"; } + std::cout << "\n"; + free(out); } - */ // test time + cudaProfilerStart(); struct timeval start, end; - mpi::barrier(); cudaDeviceSynchronize(); + mpi::barrier(); + gettimeofday(&start, NULL); ft_nvtx::setScope("total_time"); PUSH_RANGE("total time") + // warm up + ite = 3; for (int i = 0; i < ite; ++i) { llama.forward(&output_tensors, &input_tensors, &llama_weights); } @@ -328,7 +312,6 @@ void llama_example(const INIReader reader) POP_RANGE; ft_nvtx::resetScope(); gettimeofday(&end, NULL); - cudaProfilerStop(); printf("[INFO] request_batch_size %ld head_num %ld size_per_head %ld total_output_len %d" diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 
a74083153..58bc4b4f6 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1,4 +1,32 @@ -1, 14542, 3262, 8112, 29901, 7803, 1757, 526, 13407, 297, 263, 13569, 29889, 2688, 526, 13587, 701, 27815, 29889, 0 -1, 5057, 12500, 29901, 319, 8023, 338, 2734, 1623, 263, 5702, 29889, 450, 8023, 6057, 964, 263, 1559, 29889, 0 -1, 5057, 12500, 29901, 319, 8023, 338, 2734, 1623, 263, 5702, 29889, 450, 8023, 4947, 297, 263, 1775, 29889, 0 -1, 28551, 292, 29901, 11647, 526, 13407, 373, 263, 17306, 310, 15007, 29889, 11647, 526, 1985, 2768, 263, 5214, 29889 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 432, 809, 295, 719, 27372, 29889, 7605, 263, 286, 789, 5941, 292, 10823, 363, 596, 3632, 331, 1943, 432, 809, 295, 719, 27372, 9522, 412, 29889, 360, 728, 29559, 411, 1395, 559, 29899, 7582, 1259, 4426, 508, 367, 1304, 29892, 541, 14383, 270, 728, 29559, 411, 4023, 845, 9418, 29899, 29890, 5761, 616, 4426, 408, 445, 508, 17820, 278, 8341, 1283, 432, 809, 295, 719, 28001 , 29889 +1, 3201, 955, 29901, 1128, 304, 679, 304, 413, 4442, 340, 29889, 315, 905, 263, 1513, 16286, 304, 413, 4442, 2165, 4799, 637, 313, 29926, 6547, 3300, 352, 13607, 6121, 4799, 637, 29897, 515, 29129, 1450, 470, 4655, 14721, 273, 14368, 29892, 1316, 408, 286, 348, 436, 29892, 301, 898, 265, 29892, 1226, 23559, 29892, 10395, 2429, 29892, 282, 1431, 434, 29892, 610, 275, 470, 7655, 1915, 29889, 1704, 26536, 3160, 3287, 1248, 728, 4799, 9012 +1, 8778, 322, 19906, 29901, 1128, 304, 12566, 330, 2390, 267, 515, 17564, 29879, 29889, 26428, 596, 2646, 412, 325, 1475, 411, 2691, 27716, 7787, 1259, 304, 12566, 278, 330, 2390, 267, 29889, 450, 27716, 881, 367, 1546, 29871, 29900, 29889, 29945, 304, 29871, 29900, 29889, 29947, 3533, 17528, 690, 313, 29900, 29889, 29900, 29906, 29900, 304, 29871, 29900, 29889, 29900, 29941, 29896, 297, 29897, 304, 12566, 278, 330, 2390, 267, 515, 285, 3687, 29892, 25550 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 6548, 5520, 321, 29891, 295, 1161, 267, 322, 2989, 261, 321, 29891, 774, 798, 18180, 29889, 15154, 1207, 786, 4646, 368, 29889, 341, 6151, 2518, 322, 321, 29891, 774, 798, 9127, 29892, 2175, 373, 321, 29891, 295, 1161, 267, 322, 321, 29891, 774, 5727, 975, 11147, 674, 18658, 1438, 15409, 2578, 25414, 322, 674, 5557, 321, 29891, 295, 1161, 267, 322, 321, 29891, 774, 5727, 515, 15678, 636 +1, 26040, 29901, 1128, 304, 1207, 3632, 331, 1943, 6635, 1634, 514, 296, 29889, 3462, 278, 18853, 17182, 304, 263, 805, 764, 18046, 280, 29889, 1152, 278, 1634, 514, 296, 29892, 366, 29915, 645, 817, 263, 29871, 29906, 29899, 21543, 313, 29945, 29929, 286, 29880, 29897, 12917, 805, 764, 18046, 280, 29889, 317, 802, 29872, 911, 29871, 29906, 4441, 567, 310, 454, 3712, 18853, 17182, 1919, 29871, 29906, 4441, 567, 310, 8775, 24841, 18853, 17182, 29892 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 7375, 322, 269, 473, 22300, 29889, 323, 2209, 278, 282, 548, 658, 262, 411, 278, 6501, 577, 29891, 12507, 346, 29889, 3462, 29871, 29945, 29871, 1309, 778, 313, 29896, 29946, 29906, 330, 29897, 310, 10814, 6393, 282, 548, 658, 262, 393, 29915, 29879, 1063, 5700, 297, 29871, 30515, 29899, 22466, 313, 29953, 29899, 4317, 29897, 12003, 10076, 567, 322, 29871, 29906, 734, 294, 1129, 787, 313, 29896 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 19417, 325, 524, 482, 3438, 2017, 432, 809, 295, 719, 373, 18230, 388, 29889, 29301, 1432, 325, 524, 482, 3438, 2017, 432, 809, 295, 719, 10754, 2909, 322, 1432, 325, 524, 482, 3438, 2017, 432, 809, 295, 719, 1856, 3268, 366, 508, 1284, 
29889, 6280, 4447, 675, 7535, 411, 3785, 11949, 29892, 278, 664, 310, 1532, 2998, 2874, 414, 29892, 12713, 22848, 29892, 1539, 1338, 29892, 25702 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 19531, 260, 4227, 29880, 1600, 384, 7901, 10412, 29889, 14542, 852, 263, 19875, 29891, 470, 1045, 3594, 260, 4227, 29880, 1600, 384, 363, 263, 901, 3209, 950, 1106, 29889, 319, 12003, 29892, 1302, 1537, 260, 4227, 29880, 1600, 384, 7901, 1008, 322, 263, 5101, 310, 1320, 1253, 287, 1444, 550, 338, 278, 4922, 982, 304, 7952, 14294, 373, 263, 11220, 4723, 355, 29889, 7357, 523, 2814, 470, 260 +1, 25453, 322, 17465, 292, 29901, 1128, 304, 17545, 901, 330, 2390, 267, 29889, 3462, 330, 2390, 267, 304, 596, 4497, 328, 29889, 319, 5972, 322, 4780, 982, 304, 7910, 278, 5253, 310, 330, 2390, 267, 297, 596, 14218, 652, 300, 338, 304, 28189, 263, 2846, 8870, 1490, 330, 2390, 267, 373, 2246, 310, 263, 301, 3322, 29899, 272, 270, 2559, 814, 603, 4497, 328, 29889, 450, 14225, 21054, 272, 322, 7990, 18459, 310, 278, 330 +1, 15202, 29901, 1128, 304, 11039, 403, 18655, 1849, 964, 263, 9045, 29891, 26044, 29889, 8561, 263, 18655, 519, 885, 2572, 569, 363, 26044, 29889, 319, 18655, 519, 885, 2572, 569, 338, 263, 2560, 270, 728, 1754, 491, 872, 329, 29948, 292, 18655, 1849, 297, 263, 4091, 340, 7243, 322, 769, 4417, 367, 2579, 29808, 975, 963, 304, 4808, 278, 270, 728, 4208, 29889, 14893, 491, 4417, 738, 18655, 1849, 366, 763, 29892, 3704, 373, 1080 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 263, 3632, 331, 1943, 9045, 29891, 3700, 471, 29882, 29889, 422, 26062, 599, 310, 278, 2348, 1127, 10070, 29889, 512, 263, 18350, 29899, 29879, 1891, 12580, 29880, 29892, 6837, 4208, 29871, 30226, 18002, 313, 29946, 29945, 330, 29897, 310, 29081, 288, 1446, 29892, 29871, 30515, 18002, 313, 29945, 29929, 286, 29880, 29897, 10849, 454, 3712, 3623, 625, 29892, 29871, 30515, 18002, 313, 29945, 29929, 286, 29880, 29897 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 1207, 18655, 13956, 286, 1878, 8345, 8310, 29891, 29889, 5701, 1082, 278, 373, 291, 322, 286, 1878, 18901, 29889, 940, 271, 29871, 30226, 18002, 313, 29896, 29906, 29900, 286, 29880, 29897, 310, 4805, 29899, 2405, 5359, 288, 9258, 17182, 297, 263, 2919, 12507, 346, 8357, 975, 18350, 29899, 9812, 12871, 29889, 9038, 278, 17182, 528, 6727, 414, 29892, 788, 29871, 30226, 310, 263, 2319, 373, 291, 393 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 6958, 10992, 2963, 29889, 15484, 263, 1246, 363, 278, 19075, 982, 304, 6159, 10992, 2963, 29889, 960, 366, 723, 763, 304, 505, 263, 2022, 29899, 517, 29899, 10532, 14983, 29892, 270, 616, 278, 2498, 297, 6578, 2722, 1196, 472, 29871, 29896, 29899, 29947, 29900, 29900, 29899, 29953, 29953, 29947, 29899, 29953, 29955, 29953, 29945, 29889, 2688, 29915, 276, 1722, 7398, 388, 304, 1424, 22394, 29871, 29955, 263 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 8267, 596, 321, 29891, 774, 5727, 411, 263, 8006, 272, 29889, 349, 27574, 263, 8006, 272, 1754, 10816, 363, 321, 29891, 774, 5727, 29889, 319, 3918, 8006, 272, 674, 451, 2367, 366, 13173, 3347, 29879, 29892, 322, 508, 367, 18215, 304, 671, 2978, 596, 5076, 29889, 8669, 20590, 385, 321, 29891, 774, 798, 8006, 272, 29892, 5069, 2319, 12995, 311, 674, 2367, 366, 278, 3347, 29879, 366, 13521 +1, 15202, 29901, 1128, 304, 5040, 23023, 1848, 321, 1218, 29889, 360, 8349, 7268, 403, 1546, 9128, 18757, 261, 322, 23023, 1848, 18757, 261, 29889, 1763, 18720, 278, 9946, 310, 596, 23023, 1848, 321, 1218, 29892, 372, 1122, 367, 5407, 304, 937, 2274, 746, 366, 526, 11223, 4824, 1711, 
9074, 14793, 322, 746, 366, 526, 11223, 953, 8194, 635, 9074, 14793, 29889, 26991, 29892, 23023, 1848, 18757, 261, 5304, 373, 11584, 322, 23880, 5065, 5362, 29889 +1, 4231, 749, 322, 15197, 29901, 1128, 304, 289, 5790, 1044, 4856, 29889, 7519, 29883, 403, 7535, 1048, 278, 289, 5790, 1044, 29889, 450, 289, 5790, 1044, 29892, 884, 2998, 408, 278, 23729, 1458, 19119, 9045, 1044, 313, 1579, 272, 1458, 1002, 1082, 16385, 29871, 29941, 29929, 29946, 29892, 760, 474, 29897, 338, 263, 4307, 393, 471, 4502, 304, 9801, 322, 1072, 5987, 11176, 14703, 19119, 9045, 14502, 5786, 363, 1906, 23164, 515, 263, 19119, 4486 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 263, 5613, 15774, 23895, 2017, 29889, 4007, 6967, 278, 2348, 1127, 10070, 29889, 1152, 445, 9522, 412, 29892, 366, 674, 817, 278, 1494, 4452, 584, 29871, 29896, 2894, 293, 274, 2559, 314, 265, 12070, 1919, 29871, 29906, 29945, 2894, 293, 5881, 314, 290, 2532, 29879, 1919, 29871, 29896, 29945, 2894, 293, 17184, 1960, 1919, 29871, 29896, 2894, 293, 1109, 2911, 17796, 1919, 29871, 29896, 10849, 2894, 293 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 471, 29882, 4105, 4841, 29889, 14542, 852, 278, 1492, 528, 314, 1129, 29877, 322, 4195, 261, 29889, 5806, 738, 528, 314, 1129, 29877, 470, 4195, 261, 674, 664, 29892, 372, 338, 2253, 304, 671, 2730, 391, 332, 5281, 528, 314, 1129, 359, 322, 4195, 414, 29892, 7148, 565, 596, 8716, 29886, 338, 15589, 322, 372, 23766, 29889, 3834, 9316, 1316, 408, 1183, 29874, 2730, 391, 545, 263, 1341 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 1207, 263, 19408, 413, 3780, 12343, 7539, 282, 5863, 29889, 10306, 278, 11994, 363, 3907, 413, 295, 1188, 29887, 29915, 29879, 19408, 413, 3780, 29886, 583, 2578, 1446, 2441, 29889, 1670, 674, 3117, 367, 1048, 4203, 263, 9853, 310, 19408, 413, 3780, 29886, 583, 7539, 2175, 29889, 313, 697, 310, 278, 2625, 23633, 310, 1641, 278, 7984, 29892, 338, 366, 679, 304, 17545, 738, 29915, 454, 29888 +1, 349, 1691, 322, 24980, 1338, 29901, 1128, 304, 260, 4003, 8343, 263, 2653, 23717, 29889, 402, 1624, 596, 28075, 29889, 887, 674, 817, 263, 29871, 29896, 29906, 21759, 269, 4316, 19144, 29892, 263, 4964, 14051, 495, 8343, 292, 260, 4003, 29892, 322, 263, 29871, 29896, 29953, 29899, 22466, 318, 276, 386, 1705, 274, 493, 1308, 411, 263, 24235, 310, 29871, 29945, 285, 4615, 313, 1454, 2319, 26361, 29897, 322, 29871, 29947, 285, 4615, 313, 1454 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 2867, 297, 2373, 296, 454, 1624, 17394, 267, 29889, 5373, 29891, 17394, 267, 393, 6216, 1532, 29889, 3080, 326, 675, 278, 817, 363, 16116, 292, 470, 16679, 297, 491, 2805, 2373, 296, 454, 1624, 17394, 267, 393, 526, 2307, 263, 1781, 6216, 363, 366, 29889, 4001, 2373, 296, 454, 1624, 338, 380, 2593, 322, 29395, 990, 4357, 29892, 366, 29915, 276, 451, 2675, 304, 367, 2221, 304, 1735 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 9563, 470, 2329, 263, 528, 10511, 1283, 321, 29891, 774, 798, 29889, 14542, 852, 385, 321, 29891, 774, 798, 282, 3977, 309, 322, 4764, 672, 393, 338, 2788, 304, 596, 5613, 2927, 292, 29889, 960, 366, 1603, 505, 697, 310, 596, 321, 29891, 774, 5727, 29892, 445, 1795, 367, 263, 2217, 6775, 489, 5143, 1993, 278, 282, 3977, 309, 304, 278, 528, 1943, 310, 596, 321, 29891, 774 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 5376, 411, 540, 5031, 23448, 263, 29889, 19530, 29876, 278, 25828, 4835, 29889, 940, 5031, 23448, 263, 756, 263, 1353, 310, 25828, 4835, 393, 12234, 2615, 1546, 1023, 322, 4832, 11405, 515, 278, 2635, 310, 14060, 545, 29889, 3834, 310, 1438, 25828, 
4835, 526, 10035, 29892, 763, 263, 1238, 369, 29892, 1550, 4045, 29892, 763, 432, 585, 299, 625, 29892, 526, 2649, 29873, 744, 18906, 310, 540 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 4529, 596, 923, 1416, 29890, 2873, 29889, 23868, 278, 1492, 11955, 29889, 887, 674, 817, 472, 3203, 1023, 1422, 528, 3076, 310, 1207, 786, 29901, 697, 393, 338, 16951, 6501, 261, 1135, 596, 5613, 19309, 16225, 363, 278, 528, 23626, 322, 697, 393, 338, 925, 263, 2217, 301, 14643, 1135, 596, 19309, 363, 278, 12141, 29879, 29889, 28277, 373, 596, 19309, 16225, 322, 24583, 29892, 1438, 508, 367 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 671, 429, 4542, 29875, 1218, 528, 314, 1129, 29877, 29889, 317, 802, 29872, 911, 263, 12616, 29899, 29879, 1891, 8828, 4757, 310, 528, 314, 1129, 29877, 304, 596, 5112, 29885, 29889, 319, 2217, 2586, 310, 429, 4542, 29875, 1218, 528, 314, 1129, 29877, 5771, 263, 1472, 982, 29889, 2860, 7990, 1259, 596, 11315, 297, 278, 1510, 261, 408, 366, 12891, 723, 29892, 269, 802, 29872, 911, 1048, 263 +1, 25453, 322, 17465, 292, 29901, 1128, 304, 1207, 521, 332, 307, 26163, 5036, 12580, 3137, 29889, 4721, 354, 271, 278, 288, 854, 304, 29871, 29946, 29945, 29900, 6719, 285, 21446, 6884, 470, 29871, 29906, 29906, 29945, 6719, 6432, 1039, 375, 636, 4122, 559, 263, 286, 3096, 262, 260, 764, 491, 285, 492, 3262, 372, 373, 967, 2625, 29889, 8669, 310, 805, 764, 292, 1661, 29899, 303, 860, 1395, 5832, 805, 764, 297, 278, 4251, 310 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 6483, 285, 719, 541, 725, 329, 10674, 1161, 29889, 14542, 852, 385, 17182, 29889, 1932, 23906, 385, 17182, 304, 6483, 285, 719, 596, 541, 725, 329, 10674, 1161, 297, 29892, 372, 29915, 29879, 4100, 304, 5839, 697, 393, 756, 263, 6133, 25158, 1298, 1135, 278, 7984, 292, 10430, 29889, 1152, 1342, 29892, 565, 366, 505, 263, 9687, 393, 4225, 304, 367, 7984, 287, 472, 29871, 29941, 29945 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 385, 288, 9258, 17182, 2730, 391, 332, 3950, 29889, 349, 27574, 953, 7273, 9215, 281, 1165, 515, 263, 3240, 737, 261, 393, 4266, 7093, 297, 28075, 363, 3907, 3632, 331, 1943, 6776, 2527, 1199, 29889, 1670, 526, 1784, 5376, 414, 393, 508, 367, 1476, 7395, 1058, 19417, 953, 7273, 9215, 281, 1165, 636, 5373, 29891, 777, 18853, 288, 2719, 393, 366, 723, 763, 304, 671, 297, 596 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 5376, 411, 14919, 21549, 29889, 1260, 8332, 403, 14919, 21549, 29899, 513, 1682, 292, 9687, 322, 29914, 272, 13748, 515, 596, 652, 300, 29889, 739, 10083, 2560, 29892, 541, 6480, 825, 366, 2348, 342, 14218, 508, 505, 263, 12176, 10879, 373, 596, 14919, 21549, 11174, 29889, 960, 366, 8369, 7535, 11223, 24937, 29892, 7243, 18219, 29892, 470, 851, 11517, 1432, 2462, 29892, 3814, 304, 2334, 472, 3203 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 19531, 3708, 552, 17441, 303, 860, 29889, 14542, 852, 12528, 301, 309, 562, 470, 22181, 1581, 17441, 303, 860, 363, 6534, 19309, 260, 2873, 29889, 7419, 363, 301, 14643, 29899, 2780, 287, 3708, 552, 17441, 303, 7358, 411, 7254, 22332, 2873, 29892, 1316, 408, 540, 1624, 470, 3805, 275, 528, 3076, 29892, 304, 1035, 296, 27240, 278, 7254, 22332, 2873, 297, 596, 15509, 19309, 29889, 4525, 674, 19595 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 596, 269, 484, 21079, 1106, 716, 1449, 29889, 399, 1161, 10508, 269, 484, 21079, 297, 278, 471, 2790, 4933, 29889, 960, 366, 505, 777, 26616, 10508, 269, 484, 21079, 29892, 366, 508, 5948, 679, 963, 5941, 491, 17452, 963, 297, 278, 471, 2790, 4933, 29892, 925, 408, 366, 723, 
738, 916, 26616, 7171, 358, 29889, 887, 1122, 884, 367, 2221, 304, 471, 29882, 777, 1661, 29899, 15257 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 6755, 263, 18870, 262, 398, 9228, 29889, 14542, 852, 263, 9228, 411, 1880, 3708, 537, 29889, 1094, 411, 599, 758, 8802, 1539, 1338, 29892, 18870, 262, 398, 1818, 367, 394, 2376, 287, 411, 916, 1539, 1338, 297, 1797, 304, 6176, 278, 2898, 2264, 3734, 363, 432, 809, 295, 719, 29889, 739, 338, 4049, 394, 2376, 287, 411, 1661, 29899, 1457, 8802, 1539, 1338, 763, 1302, 2496, 470, 274 diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 10e39fd39..bc917df72 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -98,7 +98,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten head_num_, size_per_head_, rotary_embedding_dim_, - neox_rotary_style_, + false, attention_weights->query_weight.scale_out, 0, // int8_mode stream_); @@ -294,20 +294,15 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse, - int int8_mode): - BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), + bool is_qk_buf_float): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), max_batch_size_(max_batch_size), max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), rotary_embedding_dim_(0), - neox_rotary_style_(false), - is_qk_buf_float_(is_qk_buf_float || int8_mode == 2), - weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), - int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr) + is_qk_buf_float_(is_qk_buf_float) { } @@ -321,20 +316,15 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse, - int int8_mode): - BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), + bool is_qk_buf_float): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), max_batch_size_(max_batch_size), max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), rotary_embedding_dim_(0), - neox_rotary_style_(false), - is_qk_buf_float_(is_qk_buf_float || int8_mode == 2), - weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), - int8_fc_runner_(int8_mode == 2 ? 
std::make_shared>() : nullptr) + is_qk_buf_float_(is_qk_buf_float) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); dispatcher_fp16.reset(new FusedMHARunnerFP16v2(head_num_, size_per_head_, sm_, 1.0f)); @@ -347,25 +337,19 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b size_t size_per_head, size_t local_head_num, size_t rotary_embedding_dim, - bool neox_rotary_style, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse, - int int8_mode): - BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), + bool is_qk_buf_float): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), max_batch_size_(max_batch_size), max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), rotary_embedding_dim_(rotary_embedding_dim), - neox_rotary_style_(neox_rotary_style), - is_qk_buf_float_(is_qk_buf_float), - weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), - int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr) + is_qk_buf_float_(is_qk_buf_float) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); dispatcher_fp16.reset(new FusedMHARunnerFP16v2(head_num_, size_per_head_, sm_, 1.0f)); @@ -376,18 +360,14 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(LLaMAContextAttentionL BaseAttentionLayer(attention_layer.stream_, attention_layer.cublas_wrapper_, attention_layer.allocator_, - attention_layer.is_free_buffer_after_forward_, - attention_layer.sparse_), + attention_layer.is_free_buffer_after_forward_), max_batch_size_(attention_layer.max_batch_size_), max_seq_len_(attention_layer.max_seq_len_), head_num_(attention_layer.head_num_), size_per_head_(attention_layer.size_per_head_), hidden_units_(attention_layer.hidden_units_), rotary_embedding_dim_(attention_layer.rotary_embedding_dim_), - neox_rotary_style_(attention_layer.neox_rotary_style_), - is_qk_buf_float_(attention_layer.is_qk_buf_float_), - weight_only_int8_fc_runner_(attention_layer.weight_only_int8_fc_runner_), - int8_fc_runner_(attention_layer.int8_fc_runner_) + is_qk_buf_float_(attention_layer.is_qk_buf_float_) { } @@ -408,25 +388,25 @@ template void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, true); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, true); + qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, false); k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; v_buf_2_ = k_buf_2_ + batch_size * seq_len * hidden_units_; // save memory usage when using fmha if (allocate_qk_buf) { - qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, true); + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, false); } else { allocator_->free((void**)(&qk_buf_)); } - qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, true); - qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, true); + qkv_buf_2_ = 
(T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, false); + qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, false); if (is_qk_buf_float_ == true) { if (allocate_qk_buf) { qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, true); + qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, false); } else { allocator_->free((void**)(&qk_buf_float_)); @@ -451,12 +431,6 @@ void LLaMAContextAttentionLayer::freeBuffer() allocator_->free((void**)(&qk_buf_float_)); } - allocator_->free((void**)(&mixed_gemm_workspace_)); - mixed_gemm_ws_bytes_ = 0; - - allocator_->free((void**)(&int8_gemm_workspace_)); - int8_gemm_ws_bytes_ = 0; - is_allocate_buffer_ = false; } } diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index e9086e278..635d3d15a 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -36,7 +36,6 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { const size_t size_per_head_; const size_t hidden_units_; const size_t rotary_embedding_dim_; - const bool neox_rotary_style_; // fmha runner int sm_ = getSMVersion(); @@ -52,13 +51,9 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { bool is_qk_buf_float_; - std::shared_ptr> weight_only_int8_fc_runner_; - std::shared_ptr> int8_fc_runner_; - protected: using BaseAttentionLayer::allocator_; using BaseAttentionLayer::stream_; - using BaseAttentionLayer::sparse_; T* qkv_buf_ = nullptr; T* q_buf_2_ = nullptr; T* k_buf_2_ = nullptr; @@ -67,10 +62,6 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { float* qk_buf_float_ = nullptr; T* qkv_buf_2_ = nullptr; T* qkv_buf_3_ = nullptr; - char* mixed_gemm_workspace_ = nullptr; - size_t mixed_gemm_ws_bytes_ = 0; - char* int8_gemm_workspace_ = nullptr; - size_t int8_gemm_ws_bytes_ = 0; public: LLaMAContextAttentionLayer(size_t max_batch_size, @@ -81,9 +72,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse = false, - int int8_mode = 0); + bool is_qk_buf_float); LLaMAContextAttentionLayer(size_t max_batch_size, size_t max_seq_len, @@ -94,9 +83,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse = false, - int int8_mode = 0); + bool is_qk_buf_float); LLaMAContextAttentionLayer(size_t max_batch_size, size_t max_seq_len, @@ -104,14 +91,11 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { size_t size_per_head, size_t local_head_num, size_t rotary_embedding_dim, - bool neox_rotary_style_, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse = false, - int int8_mode = 0); + bool is_qk_buf_float); LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer); diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 02d46b5b9..e0d8d1c99 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ 
b/src/fastertransformer/models/llama/LLaMA.cc @@ -31,7 +31,6 @@ void LLaMA::initialize() inter_size_, num_layer_, rotary_embedding_dim_, - neox_rotary_style_, layernorm_eps_, pipeline_para_, stream_, @@ -39,9 +38,7 @@ void LLaMA::initialize() allocator_, is_free_buffer_after_forward_, is_context_qk_buf_float_, - attention_type_, - custom_all_reduce_comm_, - enable_custom_all_reduce_); + attention_type_); } template @@ -61,22 +58,14 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_ input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_cache_seq_len, false)); decoder_output_buf_ = (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); - logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * vocab_size_, false)); - sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batch_size, false)); + //logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * max_seq_len * vocab_size_, false)); - key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); + key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); value_cache_ = key_cache_ + self_cache_size; tiled_input_ids_buf_ = - (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batch_size * max_input_len, true)); - tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batch_size, true)); - - transposed_output_ids_buf_ = - (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batch_size * max_seq_len, true)); - output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batch_size * max_seq_len, true)); - - start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, false)); - end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false)); + (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batch_size * max_input_len, false)); + tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batch_size, false)); context_decoder_input_buf_ = (T*)(allocator_->reMalloc( context_decoder_input_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); @@ -92,8 +81,7 @@ void LLaMA::freeBuffer() if (is_allocate_buffer_) { allocator_->free((void**)(&input_attention_mask_)); allocator_->free((void**)(&decoder_output_buf_)); - allocator_->free((void**)(&logits_buf_)); - allocator_->free((void**)(&sequence_lengths_)); + //allocator_->free((void**)(&logits_buf_)); allocator_->free((void**)(&key_cache_)); if (cache_indirections_[0] != nullptr) { @@ -103,11 +91,6 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&tiled_input_ids_buf_)); allocator_->free((void**)(&tiled_input_lengths_buf_)); - allocator_->free((void**)(&transposed_output_ids_buf_)); - allocator_->free((void**)(&output_ids_buf_)); - allocator_->free((void**)(&start_ids_buf_)); - allocator_->free((void**)(&end_ids_buf_)); - allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); @@ -128,9 +111,7 @@ LLaMA::LLaMA(size_t head_num, IAllocator* allocator, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, - AttentionType attention_type, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, 
is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -161,9 +142,7 @@ LLaMA::LLaMA(size_t head_num, IAllocator* allocator, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, - AttentionType attention_type, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -173,8 +152,6 @@ LLaMA::LLaMA(size_t head_num, rotary_embedding_dim_(rotary_embedding_dim), hidden_units_(head_num * size_per_head), pipeline_para_(pipeline_para), - custom_all_reduce_comm_(custom_all_reduce_comm), - enable_custom_all_reduce_(enable_custom_all_reduce), attention_type_(attention_type) { initialize(); @@ -191,8 +168,6 @@ LLaMA::LLaMA(LLaMA const& llama): rotary_embedding_dim_(llama.rotary_embedding_dim_), hidden_units_(llama.hidden_units_), pipeline_para_(llama.pipeline_para_), - custom_all_reduce_comm_(llama.custom_all_reduce_comm_), - enable_custom_all_reduce_(llama.enable_custom_all_reduce_), attention_type_(llama.attention_type_) { initialize(); @@ -205,20 +180,6 @@ LLaMA::~LLaMA() freeBuffer(); } -template -void LLaMA::registerCallback(callback_sig* fn, void* ctx) -{ - token_generated_cb_ = fn; - token_generated_ctx_ = ctx; -} - -template -void LLaMA::unRegisterCallback() -{ - token_generated_cb_ = nullptr; - token_generated_ctx_ = nullptr; -} - template void LLaMA::forward(std::vector* output_tensors, const std::vector* input_tensors, @@ -274,8 +235,8 @@ void LLaMA::forward(std::unordered_map* output_ten max_seq_len); } const cudaDataType_t gemm_data_type = getCudaDataType(); - allocateBuffer(batch_size, max_seq_len, max_cache_seq_len, max_input_length); + allocateBuffer(batch_size, max_seq_len, max_cache_seq_len, max_input_length); sync_check_cuda_error(); const DataType data_type = getTensorType(); @@ -288,11 +249,6 @@ void LLaMA::forward(std::unordered_map* output_ten const std::vector self_v_cache_shape = { num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_cache_seq_len, size_per_head_}; - // initialize the output ids and parent ids - cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * max_seq_len, stream_); - sync_check_cuda_error(); - - // handle first step invokeTileGptInputs(tiled_input_ids_buf_, tiled_input_lengths_buf_, input_tensors->at("input_ids").getPtr(), @@ -304,7 +260,7 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, - output_ids_buf_, + nullptr, llama_weights->pre_decoder_embedding_table, llama_weights->position_encoding_table, pPromptTuningParam{}, // no p/prompt tuning @@ -347,7 +303,6 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { - T* output_logits = output_tensors->at("output_logits").getPtr(); invokeGeneralLLaMALayerNorm(context_decoder_input_buf_, context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, @@ -357,6 +312,9 @@ void LLaMA::forward(std::unordered_map* output_ten hidden_units_, stream_); sync_check_cuda_error(); + + // FIXME: debugging + T *output_logits = output_tensors->at("output_logits").getPtr(); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, vocab_size_, @@ -367,6 +325,7 @@ void LLaMA::forward(std::unordered_map* output_ten context_decoder_input_buf_, hidden_units_, // n 
output_logits, + //logits_buf_, vocab_size_); sync_check_cuda_error(); } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 3b7995927..26d1a6696 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -36,27 +36,22 @@ class LLaMA: public BaseLayer { size_t vocab_size_; size_t rotary_embedding_dim_; - static constexpr bool neox_rotary_style_ = false; - static constexpr float layernorm_eps_ = 1e-6f; + static constexpr float layernorm_eps_ = 1e-6f; size_t hidden_units_; NcclParam tensor_para_; NcclParam pipeline_para_; - std::shared_ptr custom_all_reduce_comm_; - int enable_custom_all_reduce_; - AttentionType attention_type_; const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); - LLaMAContextDecoder* llama_context_decoder_; + LLaMAContextDecoder* llama_context_decoder_; void allocateBuffer() override; - void allocateBuffer( - size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); + void allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); void freeBuffer() override; void initialize(); @@ -67,64 +62,49 @@ class LLaMA: public BaseLayer { float* logits_buf_; - int* sequence_lengths_ = nullptr; - T* key_cache_; T* value_cache_; int* cache_indirections_[2] = {nullptr, nullptr}; - int* tiled_input_ids_buf_; - int* tiled_input_lengths_buf_; - int* transposed_output_ids_buf_; - int* output_ids_buf_; - int* start_ids_buf_; - int* end_ids_buf_; + int* tiled_input_ids_buf_; + int* tiled_input_lengths_buf_; - T* context_decoder_input_buf_; - T* context_decoder_output_buf_; - - // function pointer callback - using callback_sig = void(std::unordered_map*, void*); - callback_sig* token_generated_cb_ = nullptr; - void* token_generated_ctx_ = nullptr; + T* context_decoder_input_buf_; + T* context_decoder_output_buf_; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); public: - LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long random_seed, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); - - LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long random_seed, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + unsigned long long random_seed, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + 
AttentionType attention_type = AttentionType::UNFUSED_MHA); + + LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + unsigned long long random_seed, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA); LLaMA(LLaMA const& LLaMA); @@ -143,9 +123,6 @@ class LLaMA: public BaseLayer { size_t getTensorParallelRank(); size_t getTensorParallelSize(); bool* getFinishBuffer(); - - void registerCallback(callback_sig* fn, void* ctx); - void unRegisterCallback(); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 49af917de..06541af4b 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -32,14 +32,11 @@ void LLaMAContextDecoder::initialize() size_per_head_, head_num_, rotary_embedding_dim_, - neox_rotary_style_, stream_, cublas_wrapper_, allocator_, is_free_buffer_after_forward_, - is_qk_buf_float_, - false, - 0); + is_qk_buf_float_); ffn_layer_ = new SiluFfnLayer(0, // max_batch_size 0, // max_seq_len @@ -130,7 +127,6 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t size_t inter_size, size_t num_layer, size_t rotary_embedding_dim, - bool neox_rotary_style, float layernorm_eps, NcclParam pipeline_para, cudaStream_t stream, @@ -138,23 +134,18 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t IAllocator* allocator, bool is_free_buffer_after_forward, bool is_qk_buf_float, - AttentionType attention_type, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), head_num_(head_num), size_per_head_(size_per_head), inter_size_(inter_size), num_layer_(num_layer), rotary_embedding_dim_(rotary_embedding_dim), - neox_rotary_style_(neox_rotary_style), layernorm_eps_(layernorm_eps), hidden_units_(head_num * size_per_head), pipeline_para_(pipeline_para), is_qk_buf_float_(is_qk_buf_float), - attention_type_(attention_type), - custom_all_reduce_comm_(custom_all_reduce_comm), - enable_custom_all_reduce_(enable_custom_all_reduce) + attention_type_(attention_type) { initialize(); } @@ -167,14 +158,11 @@ LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decode inter_size_(decoder.inter_size_), num_layer_(decoder.num_layer_), rotary_embedding_dim_(decoder.rotary_embedding_dim_), - neox_rotary_style_(decoder.neox_rotary_style_), layernorm_eps_(decoder.layernorm_eps_), hidden_units_(decoder.hidden_units_), pipeline_para_(decoder.pipeline_para_), is_qk_buf_float_(decoder.is_qk_buf_float_), - attention_type_(decoder.attention_type_), - custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), - enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) + attention_type_(decoder.attention_type_) { initialize(); } @@ -253,6 +241,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* AttentionType attention_type = attention_type_; const bool is_unpadded_mha = isUnPaddedMHA(attention_type); + size_t h_token_num = batch_size * seq_len; if (is_unpadded_mha) { const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); diff --git 
a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index 115b3b06b..452567208 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -41,7 +41,6 @@ class LLaMAContextDecoder: public BaseLayer { size_t inter_size_; size_t num_layer_; size_t rotary_embedding_dim_; - bool neox_rotary_style_; float layernorm_eps_; // calculated data @@ -49,9 +48,6 @@ class LLaMAContextDecoder: public BaseLayer { NcclParam pipeline_para_; - std::shared_ptr custom_all_reduce_comm_; - int enable_custom_all_reduce_; - AttentionType attention_type_; bool is_qk_buf_float_; @@ -85,7 +81,6 @@ class LLaMAContextDecoder: public BaseLayer { size_t inter_size, size_t num_layer, size_t rotary_embedding_dim, - bool neox_rotary_style, float layernorm_eps, NcclParam pipeline_para, cudaStream_t stream, @@ -93,9 +88,7 @@ class LLaMAContextDecoder: public BaseLayer { IAllocator* allocator, bool is_free_buffer_after_forward, bool is_qk_buf_float, - AttentionType attention_type = AttentionType::FUSED_MHA, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce_ = 0); + AttentionType attention_type = AttentionType::FUSED_MHA); LLaMAContextDecoder(LLaMAContextDecoder const& decoder); From 321bc736ad9086d41b7191bf85ff6d5f1b728980 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 18 Sep 2023 15:58:57 +0000 Subject: [PATCH 19/55] buf fix --- .../layers/attention_layers/LLaMAContextAttentionLayer.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index bc917df72..f22fa3032 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -399,6 +399,7 @@ void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq } else { allocator_->free((void**)(&qk_buf_)); + qk_buf_ = nullptr; } qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, false); qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, false); @@ -410,6 +411,7 @@ void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq } else { allocator_->free((void**)(&qk_buf_float_)); + qk_buf_float_ = nullptr; } } From 837e9d7ab3a801cafa7ac0f570ba4f211e368dac Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 18 Sep 2023 16:22:09 +0000 Subject: [PATCH 20/55] dump --- examples/cpp/llama/llama_example.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index d2c8dcf51..721f4aef5 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -269,6 +269,7 @@ void llama_example(const INIReader reader) POP_RANGE; ft_nvtx::resetScope(); + /* if (rank == world_size - 1) { T* out = (T*)malloc(sizeof(T) * request_batch_size * total_output_len * vocab_size); cudaMemcpy(out, @@ -289,6 +290,7 @@ void llama_example(const INIReader reader) std::cout << "\n"; free(out); } + */ // test time cudaProfilerStart(); @@ -301,7 +303,7 @@ void llama_example(const INIReader reader) ft_nvtx::setScope("total_time"); PUSH_RANGE("total time") // warm up - ite = 3; + ite = 10; for (int i = 0; i < ite; ++i) { 
llama.forward(&output_tensors, &input_tensors, &llama_weights); } From 56c33256a96c61acfaa06fadedd0b40f8b60ea82 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 18 Sep 2023 17:09:55 +0000 Subject: [PATCH 21/55] add gemm_cofing.in --- examples/cpp/llama/gemm_config.in | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 examples/cpp/llama/gemm_config.in diff --git a/examples/cpp/llama/gemm_config.in b/examples/cpp/llama/gemm_config.in new file mode 100644 index 000000000..8a93b9027 --- /dev/null +++ b/examples/cpp/llama/gemm_config.in @@ -0,0 +1,12 @@ +batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time +32 256 52 128 1 ### 1 19968 8192 6656 21 0 24 1 0 0 0 0 20.595835 +32 256 52 128 1 ### 1664 256 256 128 103 -1 -1 -1 -1 -1 -1 -1 0.929050 +32 256 52 128 1 ### 1664 128 256 256 103 -1 -1 -1 -1 -1 -1 -1 0.661050 +32 256 52 128 1 ### 1 6656 8192 6656 21 0 24 1 0 0 0 0 6.882683 +32 256 52 128 1 ### 1 17920 8192 6656 21 0 24 1 0 0 0 0 18.293156 +32 256 52 128 1 ### 1 6656 8192 17920 21 0 24 2 0 1 6656 0 18.400911 +32 1 52 128 1 ### 1 19968 32 6656 3 0 21 1 1 0 0 0 0.328397 +32 1 52 128 1 ### 1 6656 32 6656 21 0 15 6 0 1 416 0 0.131215 +32 1 52 128 1 ### 1 17920 32 6656 99 -1 -1 -1 -1 -1 -1 -1 0.306050 +32 1 52 128 1 ### 1 6656 32 17920 21 0 15 6 0 1 416 0 0.312504 +32 1 52 128 1 ### 1 32000 32 6656 99 -1 -1 -1 -1 -1 -1 -1 0.753770 From 4a0a9d708ea1780f47e711ffd89d2ec286c12c05 Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 19 Sep 2023 03:54:41 +0000 Subject: [PATCH 22/55] remove backup file trace --- examples/cpp/llama/backup.csv | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 examples/cpp/llama/backup.csv diff --git a/examples/cpp/llama/backup.csv b/examples/cpp/llama/backup.csv deleted file mode 100644 index eb28ed345..000000000 --- a/examples/cpp/llama/backup.csv +++ /dev/null @@ -1,32 +0,0 @@ -1, 14542, 3262, 8112, 29901, 7803, 1757, 526, 13407, 297, 263, 13569, 29889, 2688 -1, 7392, 1026, 29901, 319, 11379, 15028, 297, 263, 17948, 8693, 29889, 450, 11379 -1, 5057, 12500, 29901, 319, 8023, 338, 2734, 1623, 263, 5702, 29889, 450, 8023 -1, 17984, 18558, 29901, 1334, 1074, 263, 767, 13407, 297, 263, 5716, 29889, 450, 767 -1, 19509, 263, 1766, 25206, 29901, 11647, 526, 2734, 373, 278, 11952, 29889, 319, 767 -1, 7412, 292, 11565, 29901, 11647, 526, 16246, 5742, 6131, 13587, 3949, 18464, 29889, 11647 -1, 7412, 292, 10311, 26082, 29901, 319, 767, 338, 16246, 2768, 263, 5716, 29889, 940 -1, 323, 4524, 29901, 319, 767, 322, 6114, 526, 16246, 373, 263, 7408, 4208, 29889, 2688 -1, 8565, 375, 3183, 29901, 319, 767, 338, 13407, 701, 322, 8026, 263, 7679, 29889, 11647 -1, 12878, 292, 278, 11203, 29901, 319, 6114, 17042, 2039, 714, 11480, 278, 17455, 29889, 7803, 2319, 26361 -1, 5057, 12500, 29901, 319, 6114, 338, 22049, 3412, 263, 5702, 29889, 2296 -1, 8565, 11203, 29901, 319, 2919, 19174, 338, 22229, 2820, 263, 1746, 29889, 11647 -1, 8360, 5367, 29901, 319, 6114, 338, 409, 630, 472, 263, 1591, 29889, 2296 -1, 476, 484, 14067, 29901, 319, 767, 17905, 1379, 373, 263, 17132, 29889, 450, 767 -1, 7412, 292, 378, 25496, 29901, 319, 767, 338, 16246, 5742, 1023, 28987, 29889, 940 -1, 1706, 262, 1076, 29901, 319, 767, 338, 16246, 373, 385, 15058, 4768, 446, 29889, 940 -1, 390, 5086, 11308, 29901, 319, 767, 338, 1153, 9292, 11308, 297, 263, 29413, 29889, 940 -1, 7412, 292, 11210, 336, 29901, 319, 767, 715, 16926, 263, 21387, 
964, 670, 11210, 29889, 940 -1, 7412, 292, 11210, 336, 29901, 319, 4123, 767, 269, 1169, 373, 263, 6592, 29889, 450, 767 -1, 28551, 292, 29901, 11647, 526, 13407, 373, 263, 17306, 310, 15007, 29889, 11647 -1, 8481, 24613, 1847, 29901, 319, 767, 338, 13407, 373, 263, 19587, 11952, 29889, 940 -1, 7412, 292, 1248, 29877, 29901, 11647, 526, 2381, 25217, 297, 278, 4094, 29889, 7803, 5866 -1, 5057, 12500, 29901, 319, 767, 15028, 297, 278, 7256, 310, 263, 10728, 1974, 29889, 29445, 6289 -1, 5057, 12500, 29901, 319, 767, 338, 4318, 2734, 1623, 263, 5702, 29889, 940 -1, 6781, 8522, 29901, 11647, 526, 13407, 373, 263, 1746, 9963, 29889, 450, 1757 -1, 8565, 11203, 29901, 319, 6114, 338, 8743, 411, 263, 11203, 29889, 450, 11203 -1, 3925, 25217, 29901, 319, 2381, 25217, 11565, 338, 4318, 297, 263, 5716, 29889, 7567 -1, 6163, 23131, 292, 29901, 12753, 2305, 748, 23131, 292, 1623, 263, 10952, 29889, 2688 -1, 399, 336, 3262, 22981, 29901, 1334, 1074, 263, 3800, 373, 18187, 29889, 319, 2022 -1, 28551, 292, 29901, 319, 2022, 338, 14993, 292, 1623, 263, 17306, 310, 15007, 29889, 2688 -1, 399, 1161, 292, 3700, 29901, 319, 6114, 338, 13407, 297, 263, 5716, 9963, 29889, 2296 -1, 2522, 11495, 1933, 292, 29901, 319, 767, 338, 1090, 4094, 297, 263, 11565, 29889, 940 From e63b85b57bc4e1d49172b323a2e658e72c8e2f2d Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 19 Sep 2023 03:56:16 +0000 Subject: [PATCH 23/55] remove gemm_config.in --- examples/cpp/llama/gemm_config.in | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 examples/cpp/llama/gemm_config.in diff --git a/examples/cpp/llama/gemm_config.in b/examples/cpp/llama/gemm_config.in deleted file mode 100644 index 8a93b9027..000000000 --- a/examples/cpp/llama/gemm_config.in +++ /dev/null @@ -1,12 +0,0 @@ -batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time -32 256 52 128 1 ### 1 19968 8192 6656 21 0 24 1 0 0 0 0 20.595835 -32 256 52 128 1 ### 1664 256 256 128 103 -1 -1 -1 -1 -1 -1 -1 0.929050 -32 256 52 128 1 ### 1664 128 256 256 103 -1 -1 -1 -1 -1 -1 -1 0.661050 -32 256 52 128 1 ### 1 6656 8192 6656 21 0 24 1 0 0 0 0 6.882683 -32 256 52 128 1 ### 1 17920 8192 6656 21 0 24 1 0 0 0 0 18.293156 -32 256 52 128 1 ### 1 6656 8192 17920 21 0 24 2 0 1 6656 0 18.400911 -32 1 52 128 1 ### 1 19968 32 6656 3 0 21 1 1 0 0 0 0.328397 -32 1 52 128 1 ### 1 6656 32 6656 21 0 15 6 0 1 416 0 0.131215 -32 1 52 128 1 ### 1 17920 32 6656 99 -1 -1 -1 -1 -1 -1 -1 0.306050 -32 1 52 128 1 ### 1 6656 32 17920 21 0 15 6 0 1 416 0 0.312504 -32 1 52 128 1 ### 1 32000 32 6656 99 -1 -1 -1 -1 -1 -1 -1 0.753770 From 3a103088af353c2881e7be3fda3a9e00dc4dc4d4 Mon Sep 17 00:00:00 2001 From: dypshong Date: Wed, 20 Sep 2023 07:05:56 +0000 Subject: [PATCH 24/55] dumdump --- examples/cpp/llama/llama_example.cc | 2 - .../kernels/bert_preprocess_kernels.cu | 2 +- src/fastertransformer/models/llama/LLaMA.cc | 95 ++++++++++--------- .../models/llama/LLaMAContextDecoder.cc | 27 +++--- 4 files changed, 63 insertions(+), 63 deletions(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 721f4aef5..2359cf022 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -269,7 +269,6 @@ void llama_example(const INIReader reader) POP_RANGE; ft_nvtx::resetScope(); - /* if (rank == world_size - 1) { T* out = (T*)malloc(sizeof(T) * request_batch_size * total_output_len * vocab_size); cudaMemcpy(out, @@ 
-290,7 +289,6 @@ void llama_example(const INIReader reader) std::cout << "\n"; free(out); } - */ // test time cudaProfilerStart(); diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.cu b/src/fastertransformer/kernels/bert_preprocess_kernels.cu index a57161c85..8179c3368 100644 --- a/src/fastertransformer/kernels/bert_preprocess_kernels.cu +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.cu @@ -467,4 +467,4 @@ template void invokeQuantizeMatrixRebuildPadding::allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_ input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_cache_seq_len, false)); decoder_output_buf_ = (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); - //logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * max_seq_len * vocab_size_, false)); + // logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * max_seq_len * + // vocab_size_, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); value_cache_ = key_cache_ + self_cache_size; @@ -81,7 +82,7 @@ void LLaMA::freeBuffer() if (is_allocate_buffer_) { allocator_->free((void**)(&input_attention_mask_)); allocator_->free((void**)(&decoder_output_buf_)); - //allocator_->free((void**)(&logits_buf_)); + // allocator_->free((void**)(&logits_buf_)); allocator_->free((void**)(&key_cache_)); if (cache_indirections_[0] != nullptr) { @@ -99,19 +100,19 @@ void LLaMA::freeBuffer() } template -LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long random_seed, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type): +LLaMA::LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + unsigned long long random_seed, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -128,21 +129,21 @@ LLaMA::LLaMA(size_t head_num, } template -LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long random_seed, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type): +LLaMA::LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + unsigned long long random_seed, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -259,24 +260,26 @@ void 
LLaMA::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); - invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, - nullptr, - llama_weights->pre_decoder_embedding_table, - llama_weights->position_encoding_table, - pPromptTuningParam{}, // no p/prompt tuning - tiled_input_ids_buf_, - 1, - max_input_length, - max_input_length, - batch_size, - hidden_units_, - stream_); - sync_check_cuda_error(); - invokeBuildDecoderAttentionMask( input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, max_input_length, 0, stream_); sync_check_cuda_error(); + if (pipeline_para_.rank_ == 0) { + invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, + nullptr, + llama_weights->pre_decoder_embedding_table, + llama_weights->position_encoding_table, + pPromptTuningParam{}, // no p/prompt tuning + tiled_input_ids_buf_, + 1, + max_input_length, + max_input_length, + batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); + } + std::unordered_map decoder_input_tensors{ {"decoder_input", Tensor{ @@ -314,7 +317,7 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); // FIXME: debugging - T *output_logits = output_tensors->at("output_logits").getPtr(); + T* output_logits = output_tensors->at("output_logits").getPtr(); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, vocab_size_, @@ -325,7 +328,7 @@ void LLaMA::forward(std::unordered_map* output_ten context_decoder_input_buf_, hidden_units_, // n output_logits, - //logits_buf_, + // logits_buf_, vocab_size_); sync_check_cuda_error(); } diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 06541af4b..08980923a 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -122,19 +122,19 @@ int LLaMAContextDecoder::getFirstLayerParallelId() } template -LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - AttentionType attention_type): +LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + float layernorm_eps, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), head_num_(head_num), size_per_head_(size_per_head), @@ -241,7 +241,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* AttentionType attention_type = attention_type_; const bool is_unpadded_mha = isUnPaddedMHA(attention_type); - size_t h_token_num = batch_size * seq_len; if (is_unpadded_mha) { const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); From be298831407d766df29d8c173ffad2bf99e56499 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 22 Sep 2023 18:36:48 +0000 Subject: [PATCH 25/55] test done --- examples/cpp/llama/llama_config.ini | 4 +- examples/cpp/llama/llama_example.cc | 33 ++- .../LLaMAContextAttentionLayer.cc | 70 +++--- .../LLaMAContextAttentionLayer.h | 40 ++-- 
src/fastertransformer/models/llama/LLaMA.cc | 207 +++++++---------- src/fastertransformer/models/llama/LLaMA.h | 67 +++--- .../models/llama/LLaMAContextDecoder.cc | 47 ++-- src/fastertransformer/th_op/llama/LLaMA.cc | 106 +++------ src/fastertransformer/th_op/llama/LLaMA.h | 216 ++++++------------ src/fastertransformer/utils/memory_utils.cu | 2 +- 10 files changed, 295 insertions(+), 497 deletions(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 931b24e5d..3df66269f 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -7,6 +7,7 @@ pipeline_para_size=4 [request] request_batch_size=32 +start_pos=2 [llama_33B] head_num=52 @@ -15,5 +16,6 @@ vocab_size=32000 decoder_layers=60 rotary_embedding=128 multiple_of=256 -max_cache_seq_len=1024 +max_seq_len=1024 padding_id=0 +random_seed=0 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 2359cf022..3065d4873 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -81,14 +81,15 @@ void llama_example(const INIReader reader) const size_t decoder_layers = reader.GetInteger(model_name, "decoder_layers"); const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); const int multiple_of = reader.GetInteger(model_name, "multiple_of"); - const size_t max_cache_seq_len = reader.GetInteger(model_name, "max_cache_seq_len"); + const size_t max_seq_len = reader.GetInteger(model_name, "max_seq_len"); const size_t hidden_units = head_num * size_per_head; const size_t inter_size = multiple_of * (((8 * hidden_units / 3) + multiple_of - 1) / multiple_of); const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); - const int min_length = reader.GetInteger("request", "min_length", 0); const int padding_id = reader.GetInteger(model_name, "padding_id"); + int start_pos = reader.GetInteger("request", "start_pos", 0); + unsigned long long random_seed = reader.GetInteger("request", "random_seed", 0); FT_CHECK(decoder_layers % pipeline_para_size == 0); @@ -181,10 +182,7 @@ void llama_example(const INIReader reader) model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; llama_weights.loadModel(model_dir); - unsigned long long random_seed; - if (rank == 0) { - random_seed = (unsigned long long)(0); - } + if (world_size > 1) { mpi::bcast(&random_seed, 1, mpi::MPI_TYPE_UNSIGNED_LONG_LONG, 0, mpi::COMM_WORLD); } @@ -193,7 +191,7 @@ void llama_example(const INIReader reader) getAttentionType(size_per_head, getSMVersion(), !((std::getenv("SHONG_PADDING") != nullptr) - && (std::string(std::getenv("SHONG_PADDING")) == "ON")), //true, // remove_padding + && (std::string(std::getenv("SHONG_PADDING")) == "ON")), // true, // remove_padding 0, // llama supports any-seq-length fmha true, // is_fuse false, // with_relative_position_bias @@ -221,6 +219,7 @@ void llama_example(const INIReader reader) vocab_size, rotary_embedding_dim, random_seed, + max_seq_len, tensor_para, pipeline_para, stream, @@ -230,23 +229,18 @@ void llama_example(const INIReader reader) &prop, attention_type); - T* d_output_logits; + float* d_output_logits; deviceMalloc(&d_output_logits, request_batch_size * total_output_len * vocab_size, false); - std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, (size_t)max_input_len}, 
d_input_ids}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size}, d_input_lengths}}, - {"output_seq_len", - Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}, - {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{1}, &min_length}}, - {"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}, - {"max_cache_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &max_cache_seq_len}}}; + {"start_pos", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &start_pos}}}; std::unordered_map output_tensors = std::unordered_map{ {"output_logits", Tensor{MEMORY_GPU, - TYPE_FP16, + TYPE_FP32, std::vector{request_batch_size, (size_t)total_output_len, vocab_size}, d_output_logits}}}; @@ -269,12 +263,14 @@ void llama_example(const INIReader reader) POP_RANGE; ft_nvtx::resetScope(); + /* if (rank == world_size - 1) { - T* out = (T*)malloc(sizeof(T) * request_batch_size * total_output_len * vocab_size); + float* out = (float*)malloc(sizeof(float) * request_batch_size * total_output_len * vocab_size); cudaMemcpy(out, d_output_logits, - sizeof(T) * request_batch_size * total_output_len * vocab_size, - cudaMemcpyDeviceToHost); + sizeof(float) * request_batch_size * total_output_len * vocab_size, + cudaMemcpyDeviceToHost + ); for (int b = 0; b < request_batch_size; ++b) { std::cout << "["; for (int s = 0; s < total_output_len; ++s) { @@ -289,6 +285,7 @@ void llama_example(const INIReader reader) std::cout << "\n"; free(out); } + */ // test time cudaProfilerStart(); diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index f22fa3032..f0dfce8c7 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -43,11 +43,12 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); FT_CHECK(output_tensors->at("key_cache").shape.size() == 5); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); - const int request_batch_size = input_tensors->at("attention_mask").shape[0]; - const int request_seq_len = input_tensors->at("attention_mask").shape[2]; - const int layer_id = input_tensors->getVal("layer_id"); - const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); - int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + const int batch_size = input_tensors->at("attention_mask").shape[0]; + const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int layer_id = input_tensors->getVal("layer_id"); + const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); + int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + size_t start_pos = input_tensors->at("start_pos").max(); T* attention_out = output_tensors->at("hidden_features").getPtr(); T* attention_input = input_tensors->at("input_query").getPtr(); @@ -58,11 +59,11 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten "LLaMA Context FUSED_PADDED_MHA is not supported !"); PUSH_RANGE("attention buffer alloc"); - allocateBuffer(request_batch_size, request_seq_len, attention_type != AttentionType::FUSED_MHA); + allocateBuffer(batch_size, seq_len, attention_type != AttentionType::FUSED_MHA); POP_RANGE; sync_check_cuda_error(); - const int m = input_tensors->at("input_query").shape[0]; + const int m = 
input_tensors->at("input_query").shape[0]; PUSH_RANGE("qkv_gemm"); @@ -83,7 +84,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync(q_buf_2_, 0, request_batch_size * request_seq_len * 3 * hidden_units_ * sizeof(T), stream_); + cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); } invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, @@ -92,8 +93,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, attention_weights->query_weight.bias, padding_offset, - request_batch_size, - request_seq_len, + batch_size, + seq_len, m, head_num_, size_per_head_, @@ -108,12 +109,17 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // Use batch major // put k/v_buf from shape [B, H, L, Dh] // to cache [B, H, Dh/x, L, x] and [B, H, L, Dh/x, x] + // TODO: Cache implementation + // k_cache: [batch_size, num_heads, L, Dh] + // k_buf: [batch_size, num_heads, start_pos + seq_len, Dh] + // v_buf: [batch_size, num_heads, L, Dh] + invokeTranspose4dBatchMajor(output_tensors->getPtr("key_cache"), output_tensors->getPtr("value_cache"), k_buf_2_, v_buf_2_, - request_batch_size, - request_seq_len, + batch_size, + seq_len, max_seq_len, size_per_head_, head_num_, @@ -122,19 +128,16 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // k_cache = (batch_size, num_heads, Dh/x, L, x) // v_cache = (batch_size, num_heads, L, Dh) sync_check_cuda_error(); - - // NOTE: qkv buffer shape (batch_size, num_heads,L or prompt_len + L, Dh) - POP_RANGE; if (attention_type == AttentionType::FUSED_MHA) { - dispatcher_fp16->setup_causal_masked_fmha(request_seq_len, request_batch_size); + dispatcher_fp16->setup_causal_masked_fmha(seq_len, batch_size); dispatcher_fp16->run_causal_masked_fmha(qkv_buf_, cu_seqlens, qkv_buf_3_, true, stream_); } else { const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = request_seq_len; // q length - const int attention_seq_len_2 = request_seq_len; // kv length + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = start_pos + seq_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); // @@ -161,7 +164,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten CUDA_R_32F, attention_seq_len_2, // n attention_seq_len_2 * attention_seq_len_1, - request_batch_size * head_num_, // global batch size + batch_size * head_num_, // global batch size CUDA_R_32F); sync_check_cuda_error(); @@ -172,7 +175,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = request_batch_size; + param.batch_size = batch_size; param.q_length = attention_seq_len_1; param.k_length = attention_seq_len_2; param.num_heads = head_num_; @@ -198,7 +201,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qk_buf_, attention_seq_len_2, attention_seq_len_2 * attention_seq_len_1, - request_batch_size * head_num_); + batch_size * head_num_); POP_RANGE; PUSH_RANGE("softmax"); @@ -206,7 +209,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) param.qk = qk_buf_; // (batch_size, 
head_num, q_length, k_length) param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = request_batch_size; + param.batch_size = batch_size; param.q_length = attention_seq_len_1; param.k_length = attention_seq_len_2; param.num_heads = head_num_; @@ -232,13 +235,13 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_2_, size_per_head_, attention_seq_len_1 * size_per_head_, - request_batch_size * head_num_); + batch_size * head_num_); // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) if (padding_offset == nullptr) { invokeTransposeQKV(qkv_buf_3_, qkv_buf_2_, - request_batch_size, + batch_size, attention_seq_len_1, head_num_, size_per_head_, @@ -251,7 +254,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, qkv_buf_3_, m, - request_batch_size, + batch_size, attention_seq_len_1, head_num_, size_per_head_, @@ -286,8 +289,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( size_t head_num, size_t size_per_head, cudaStream_t stream, @@ -296,8 +298,6 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b bool is_free_buffer_after_forward, bool is_qk_buf_float): BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), - max_batch_size_(max_batch_size), - max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), @@ -307,8 +307,7 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b } template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( size_t head_num, size_t size_per_head, size_t local_head_num, @@ -318,8 +317,6 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b bool is_free_buffer_after_forward, bool is_qk_buf_float): BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), - max_batch_size_(max_batch_size), - max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), @@ -331,8 +328,7 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b } template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( size_t head_num, size_t size_per_head, size_t local_head_num, @@ -343,8 +339,6 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b bool is_free_buffer_after_forward, bool is_qk_buf_float): BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), - max_batch_size_(max_batch_size), - max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), @@ -361,8 +355,6 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(LLaMAContextAttentionL attention_layer.cublas_wrapper_, attention_layer.allocator_, attention_layer.is_free_buffer_after_forward_), - max_batch_size_(attention_layer.max_batch_size_), - max_seq_len_(attention_layer.max_seq_len_), head_num_(attention_layer.head_num_), size_per_head_(attention_layer.size_per_head_), hidden_units_(attention_layer.hidden_units_), diff --git 
a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index 635d3d15a..85fd74af8 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -27,15 +27,11 @@ namespace fastertransformer { template class LLaMAContextAttentionLayer: public BaseAttentionLayer { private: - // buffer handling - size_t max_batch_size_ = 0; - size_t max_seq_len_ = 0; - // metadata - const size_t head_num_; - const size_t size_per_head_; - const size_t hidden_units_; - const size_t rotary_embedding_dim_; + const size_t head_num_; + const size_t size_per_head_; + const size_t hidden_units_; + const size_t rotary_embedding_dim_; // fmha runner int sm_ = getSMVersion(); @@ -54,19 +50,17 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { protected: using BaseAttentionLayer::allocator_; using BaseAttentionLayer::stream_; - T* qkv_buf_ = nullptr; - T* q_buf_2_ = nullptr; - T* k_buf_2_ = nullptr; - T* v_buf_2_ = nullptr; - T* qk_buf_ = nullptr; - float* qk_buf_float_ = nullptr; - T* qkv_buf_2_ = nullptr; - T* qkv_buf_3_ = nullptr; + T* qkv_buf_ = nullptr; + T* q_buf_2_ = nullptr; + T* k_buf_2_ = nullptr; + T* v_buf_2_ = nullptr; + T* qk_buf_ = nullptr; + float* qk_buf_float_ = nullptr; + T* qkv_buf_2_ = nullptr; + T* qkv_buf_3_ = nullptr; public: - LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, - size_t head_num, + LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -74,9 +68,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { bool is_free_buffer_after_forward, bool is_qk_buf_float); - LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, - size_t head_num, + LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, size_t local_head_num, cudaStream_t stream, @@ -85,9 +77,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { bool is_free_buffer_after_forward, bool is_qk_buf_float); - LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, - size_t head_num, + LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, size_t local_head_num, size_t rotary_embedding_dim, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 7f9faf463..e4c4e4ee8 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -19,7 +19,9 @@ #include "src/fastertransformer/kernels/decoding_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" #include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" +#include "src/fastertransformer/utils/memory_utils.h" #include +#include namespace fastertransformer { @@ -48,30 +50,30 @@ void LLaMA::allocateBuffer() } template -void LLaMA::allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) +void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - const size_t self_cache_size = - (num_layer_ / pipeline_para_.world_size_) * batch_size * max_cache_seq_len * hidden_units_; + const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batch_size * max_seq_len * hidden_units_; - input_attention_mask_ = (T*)(allocator_->reMalloc( - input_attention_mask_, sizeof(T) 
* batch_size * max_seq_len * max_cache_seq_len, false)); + input_attention_mask_ = + (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * max_seq_len, false)); decoder_output_buf_ = (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); - // logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * max_seq_len * - // vocab_size_, false)); + normed_decoder_output_buf_ = + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); + logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); value_cache_ = key_cache_ + self_cache_size; tiled_input_ids_buf_ = - (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batch_size * max_input_len, false)); + (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batch_size * seq_len, false)); tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batch_size, false)); - context_decoder_input_buf_ = (T*)(allocator_->reMalloc( - context_decoder_input_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); + context_decoder_input_buf_ = + (T*)(allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); context_decoder_output_buf_ = (T*)(allocator_->reMalloc( - context_decoder_output_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); + context_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); is_allocate_buffer_ = true; } @@ -82,7 +84,7 @@ void LLaMA::freeBuffer() if (is_allocate_buffer_) { allocator_->free((void**)(&input_attention_mask_)); allocator_->free((void**)(&decoder_output_buf_)); - // allocator_->free((void**)(&logits_buf_)); + allocator_->free((void**)(&logits_buf_)); allocator_->free((void**)(&key_cache_)); if (cache_indirections_[0] != nullptr) { @@ -100,19 +102,20 @@ void LLaMA::freeBuffer() } template -LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long random_seed, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type): +LLaMA::LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + size_t random_seed, + size_t max_seq_len, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -120,6 +123,8 @@ LLaMA::LLaMA(size_t head_num, num_layer_(num_layer), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), + random_seed_(random_seed), + max_seq_len_(max_seq_len), hidden_units_(head_num * size_per_head), attention_type_(attention_type) { @@ -129,21 +134,22 @@ LLaMA::LLaMA(size_t head_num, } template -LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long 
random_seed, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type): +LLaMA::LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + size_t random_seed, + size_t max_seq_len, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -151,6 +157,8 @@ LLaMA::LLaMA(size_t head_num, num_layer_(num_layer), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), + random_seed_(random_seed), + max_seq_len_(max_seq_len), hidden_units_(head_num * size_per_head), pipeline_para_(pipeline_para), attention_type_(attention_type) @@ -167,6 +175,8 @@ LLaMA::LLaMA(LLaMA const& llama): num_layer_(llama.num_layer_), vocab_size_(llama.vocab_size_), rotary_embedding_dim_(llama.rotary_embedding_dim_), + random_seed_(llama.random_seed_), + max_seq_len_(llama.max_seq_len_), hidden_units_(llama.hidden_units_), pipeline_para_(llama.pipeline_para_), attention_type_(llama.attention_type_) @@ -195,49 +205,29 @@ void LLaMA::forward(std::unordered_map* output_ten const LLaMAWeight* llama_weights) { // input_tensors: - // input_ids [batch_size, max_input_length] + // input_ids [batch_size, seq_len] // input_lengths [batch_size] - // output_seq_len [batch_size] on cpu - // min_length [1] or [batch_size] on cpu, optional, int - // random_seed [1] or [batch_size] on cpu, optional, unsigned long long int. - // max_cache_seq_len [batch_size] on cpu + // start_pos [1] int on cpu // output_tensors: - // output_logits [batch_size, max_output_seq_len, vocab_size] + // output_logits [batch_size, seq_len, vocab_size] FT_CHECK_WITH_INFO(input_tensors->size() >= 3, "input_tensors->size() >= 3"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); - FT_CHECK(input_tensors->find("output_seq_len") != input_tensors->end() - && input_tensors->at("output_seq_len").shape.size() == 1); const size_t batch_size = input_tensors->at("input_ids").shape[0]; // NOTE: Prefix Prompt PreProcessing // get prefix_prompt_weight for each batch --> shape [batch, 1] // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] - int max_input_length = input_tensors->at("input_ids").shape[1]; + int seq_len = input_tensors->at("input_ids").shape[1]; - // Prefix Soft Prompt - const size_t max_output_seq_len = input_tensors->at("output_seq_len").max(); - const size_t max_seq_len = max_output_seq_len; // max cache seq len should include max prefix prompt length as it has k/v states - const size_t max_cache_seq_len = input_tensors->at("max_cache_seq_len").max(); - if (max_cache_seq_len < max_seq_len) { - FT_LOG_WARNING("max_cache_seq_len (%d) is less than max_seq_len (%d). " - "Note that this reduces the memory cost of k/v cache, but may hurt the accuracy.", - max_cache_seq_len, - max_seq_len); - } - else if (max_cache_seq_len > max_seq_len) { - FT_LOG_WARNING("max_cache_seq_len (%d) is larger than max_seq_len (%d). " - "This may lead to additional memory cost. 
Suggest to use smaller max_cache_seq_len.", - max_cache_seq_len, - max_seq_len); - } + const size_t start_pos = input_tensors->at("start_pos").max(); const cudaDataType_t gemm_data_type = getCudaDataType(); - allocateBuffer(batch_size, max_seq_len, max_cache_seq_len, max_input_length); + allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); const DataType data_type = getTensorType(); @@ -245,10 +235,10 @@ void LLaMA::forward(std::unordered_map* output_ten batch_size, head_num_, size_per_head_ / (16 / sizeof(T)), - max_cache_seq_len, + max_seq_len_, 16 / sizeof(T)}; const std::vector self_v_cache_shape = { - num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_cache_seq_len, size_per_head_}; + num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}; invokeTileGptInputs(tiled_input_ids_buf_, tiled_input_lengths_buf_, @@ -256,12 +246,12 @@ void LLaMA::forward(std::unordered_map* output_ten input_tensors->at("input_lengths").getPtr(), batch_size, 1, - max_input_length, + seq_len, stream_); sync_check_cuda_error(); invokeBuildDecoderAttentionMask( - input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, max_input_length, 0, stream_); + input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, seq_len, 0, stream_); sync_check_cuda_error(); if (pipeline_para_.rank_ == 0) { @@ -272,8 +262,8 @@ void LLaMA::forward(std::unordered_map* output_ten pPromptTuningParam{}, // no p/prompt tuning tiled_input_ids_buf_, 1, - max_input_length, - max_input_length, + seq_len, + seq_len, // must be same batch_size, hidden_units_, stream_); @@ -282,94 +272,51 @@ void LLaMA::forward(std::unordered_map* output_ten std::unordered_map decoder_input_tensors{ {"decoder_input", - Tensor{ - MEMORY_GPU, data_type, {batch_size, (size_t)max_input_length, hidden_units_}, context_decoder_input_buf_}}, + Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_input_buf_}}, {"attention_mask", - Tensor{MEMORY_GPU, - data_type, - {batch_size, 1, (size_t)max_input_length, (size_t)(max_input_length)}, - input_attention_mask_}}, - {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, tiled_input_lengths_buf_}}}; + Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(seq_len)}, input_attention_mask_}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, tiled_input_lengths_buf_}}, + {"start_pos", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &start_pos}}}; std::unordered_map decoder_output_tensors{ {"decoder_output", - Tensor{MEMORY_GPU, - data_type, - {batch_size, (size_t)max_input_length, hidden_units_}, - context_decoder_output_buf_}}, + Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_output_buf_}}, {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, - {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}, - {"last_token_hidden_units", Tensor{MEMORY_GPU, data_type, {batch_size, hidden_units_}, decoder_output_buf_}}}; + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}}; llama_context_decoder_->forward( &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { - invokeGeneralLLaMALayerNorm(context_decoder_input_buf_, + invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, context_decoder_output_buf_, 
llama_weights->post_decoder_layernorm.gamma, llama_weights->post_decoder_layernorm.beta, layernorm_eps_, - batch_size * max_input_length, + batch_size * seq_len, hidden_units_, stream_); sync_check_cuda_error(); - - // FIXME: debugging - T* output_logits = output_tensors->at("output_logits").getPtr(); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, vocab_size_, - batch_size * max_input_length, + batch_size * seq_len, hidden_units_, llama_weights->post_decoder_embedding.kernel, vocab_size_, - context_decoder_input_buf_, + normed_decoder_output_buf_, hidden_units_, // n - output_logits, - // logits_buf_, + logits_buf_, vocab_size_); sync_check_cuda_error(); - } - // sendTensorsToFirstPipelineNode(output_tensors, input_tensors); -} - -template -void LLaMA::sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors) -{ - NcclParam tensor_para(0, 1); - - FT_LOG_DEBUG(__PRETTY_FUNCTION__); - if (pipeline_para_.world_size_ == 1) { - // throw errors when detected - ftNcclStreamSynchronize(tensor_para, pipeline_para_, stream_); - return; - } - const auto pp_rank = pipeline_para_.rank_; - - ftNcclGroupStart(); - for (auto const& it : *output_tensors) { - if (it.second.data == nullptr) { - continue; - } - - if (pp_rank == pipeline_para_.world_size_ - 1) { - ftNcclSend(it.second.getPtr(), it.second.sizeBytes(), 0, pipeline_para_, stream_); - } - else if (pp_rank == 0) { - ftNcclRecv(it.second.getPtr(), - it.second.sizeBytes(), - pipeline_para_.world_size_ - 1, - pipeline_para_, - stream_); + if (std::is_same::value) { + float* output_logits = output_tensors->at("output_logits").getPtr(); + invokeCudaCast(output_logits, logits_buf_, batch_size * seq_len * vocab_size_, stream_); + sync_check_cuda_error(); } } - ftNcclGroupEnd(); - // throw errors when detected - ftNcclStreamSynchronize(tensor_para, pipeline_para_, stream_); } template diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 26d1a6696..dab7a0509 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -35,6 +35,8 @@ class LLaMA: public BaseLayer { size_t num_layer_; size_t vocab_size_; size_t rotary_embedding_dim_; + size_t random_seed_; + size_t max_seq_len_; static constexpr float layernorm_eps_ = 1e-6f; @@ -51,7 +53,7 @@ class LLaMA: public BaseLayer { LLaMAContextDecoder* llama_context_decoder_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); + void allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len); void freeBuffer() override; void initialize(); @@ -59,8 +61,9 @@ class LLaMA: public BaseLayer { protected: T* input_attention_mask_; T* decoder_output_buf_; + T* normed_decoder_output_buf_; - float* logits_buf_; + T* logits_buf_; T* key_cache_; T* value_cache_; @@ -76,35 +79,37 @@ class LLaMA: public BaseLayer { const std::unordered_map* input_tensors); public: - LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long random_seed, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA); - - LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, 
- size_t rotary_embedding_dim, - unsigned long long random_seed, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA); + LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + size_t random_seed, + size_t max_seq_len, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA); + + LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + size_t random_seed, + size_t max_seq_len, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA); LLaMA(LLaMA const& LLaMA); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 08980923a..66fc30b6b 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -26,9 +26,7 @@ namespace fastertransformer { template void LLaMAContextDecoder::initialize() { - self_attention_layer_ = new LLaMAContextAttentionLayer(0, // max_batch_size - 0, // max_seq_len - head_num_, + self_attention_layer_ = new LLaMAContextAttentionLayer(head_num_, size_per_head_, head_num_, rotary_embedding_dim_, @@ -182,11 +180,11 @@ void LLaMAContextDecoder::forward(std::vector* { std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, {"attention_mask", input_tensors->at(1)}, - {"input_lengths", input_tensors->at(2)}}; + {"input_lengths", input_tensors->at(2)}, + {"start_pos", input_tensors->at(3)}}; std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, {"key_cache", output_tensors->at(1)}, - {"value_cache", output_tensors->at(2)}, - {"last_token_hidden_units", output_tensors->at(3)}}; + {"value_cache", output_tensors->at(2)}}; forward(&output_tensors_map, &input_tensors_map, llama_decoder_layer_weight); } @@ -198,27 +196,26 @@ void LLaMAContextDecoder::forward(std::unordered_map* { // input tensors: // decoder_input [batch_size, seq_len, hidden_dimension], - // attention_mask [batch_size, 1, seq_len, seq_len + max_prompt_length] + // attention_mask [batch_size, 1, seq_len, seq_len] // input_lengths [batch_size] + // start_pos [1] // output tensors: // decoder_output [batch_size, seq_len, hidden_dimension], - // key_cache [num_layer, batch, local_head_num, size_per_head // x, max_seq_len, x] - // value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] - // last_token_hidden_units [batch_size, hidden_dimension] + // key_cache [num_layer, batch, max_seq_len, local_head_num, size_per_head] + // value_cache [num_layer, batch, max_seq_len, local_head_num, size_per_head] // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * batch_size'. // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. 
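As a reference for the reworked decoder contract described in the shape comments above, here is a minimal caller-side sketch. It is not taken from the patch: the helper name `run_context_pass`, the `d_*` pointer parameters, the exact spelling `LLaMADecoderLayerWeight`, and the use of `TYPE_INT32`/`int` for `start_pos` are assumptions; the tensor-map pattern itself mirrors the calls shown in this diff.

```
// Caller-side sketch of the new LLaMAContextDecoder::forward contract.
// Assumes the fastertransformer headers/namespace are in scope; every d_*
// argument is a pre-filled device buffer with the shape noted beside it.
template<typename T>
void run_context_pass(LLaMAContextDecoder<T>*                         decoder,
                      const std::vector<LLaMADecoderLayerWeight<T>*>* layer_weights,
                      T*     d_decoder_input,   // [batch_size, seq_len, hidden_units]
                      T*     d_attention_mask,  // [batch_size, 1, seq_len, seq_len]
                      int*   d_input_lengths,   // [batch_size]
                      T*     d_decoder_output,  // [batch_size, seq_len, hidden_units]
                      T*     d_key_cache,       // [num_layer, batch, max_seq_len, head_num, size_per_head]
                      T*     d_value_cache,     // same shape as the key cache
                      size_t batch_size, size_t seq_len, size_t hidden_units,
                      const std::vector<size_t>& kv_cache_shape)
{
    const DataType data_type = getTensorType<T>();
    int            start_pos = 0;  // 0 on the prompt pass; number of cached tokens on later calls

    std::unordered_map<std::string, Tensor> inputs{
        {"decoder_input", Tensor{MEMORY_GPU, data_type, {batch_size, seq_len, hidden_units}, d_decoder_input}},
        {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, seq_len, seq_len}, d_attention_mask}},
        {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, d_input_lengths}},
        {"start_pos", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &start_pos}}};

    std::unordered_map<std::string, Tensor> outputs{
        {"decoder_output", Tensor{MEMORY_GPU, data_type, {batch_size, seq_len, hidden_units}, d_decoder_output}},
        {"key_cache", Tensor{MEMORY_GPU, data_type, kv_cache_shape, d_key_cache}},
        {"value_cache", Tensor{MEMORY_GPU, data_type, kv_cache_shape, d_value_cache}}};

    decoder->forward(&outputs, &inputs, layer_weights);
}
```

The `start_pos` input marks how many tokens are already held in the `max_seq_len`-sized key/value cache, so later incremental calls can pass only the new tokens as `decoder_input` while reusing the same tensor maps.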
- FT_CHECK(input_tensors->size() == 3); - FT_CHECK(output_tensors->size() == 4); + FT_CHECK(input_tensors->size() == 4); + FT_CHECK(output_tensors->size() == 3); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; - const int max_prompt_length = - input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; - const DataType data_type = getTensorType(); + const int batch_size = input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; + const int start_pos = input_tensors->at("start_pos").max(); + const DataType data_type = getTensorType(); allocateBuffer(batch_size, seq_len); T* decoder_input = input_tensors->at("decoder_input").getPtr(); @@ -243,15 +240,16 @@ void LLaMAContextDecoder::forward(std::unordered_map* size_t h_token_num = batch_size * seq_len; if (is_unpadded_mha) { - const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, &h_token_num, padding_offset_, cu_seqlens_, - base_input_lengths, + input_lengths, batch_size, seq_len, stream_); + sync_check_cuda_error(); } for (int l = 0; l < num_layer_; l++) { @@ -262,9 +260,10 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (l == 0 && is_unpadded_mha) { invokeRemovePadding( decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); + sync_check_cuda_error(); } - const bool is_final = false; // TODO(bhsueh) remove this flag + const bool is_final = false; T* layer_input = decoder_layer_output_; T* layer_output = decoder_layer_output_; if (!is_unpadded_mha) { @@ -279,6 +278,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); + sync_check_cuda_error(); } invokeGeneralLLaMALayerNorm(decoder_normed_input_, @@ -296,10 +296,11 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"attention_mask", Tensor{MEMORY_GPU, data_type, - {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, + {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len)}, attention_mask}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, - {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; + {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, + {"start_pos", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &start_pos}}}; if (is_unpadded_mha) { self_attention_input_tensors.insert("padding_offset", @@ -355,6 +356,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); + sync_check_cuda_error(); } if ((l == num_layer_ - 1) && is_unpadded_mha) { @@ -364,6 +366,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* h_token_num, head_num_ * size_per_head_, stream_); + sync_check_cuda_error(); } } diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index 08449b679..7be46c7ed 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ 
-17,22 +17,20 @@ #include "src/fastertransformer/th_op/llama/LLaMA.h" namespace th = torch; -namespace ft = fastertransformer; namespace torch_ext { -LLaMA::LLaMA(const int64_t head_num, - const int64_t size_per_head, - const int64_t inter_size, - const int64_t layer_num, - const int64_t vocab_size, - const int64_t rotary_embedding_dim, - const int64_t start_id, - const int64_t end_id, - const int64_t tensor_para_size, - const int64_t pipeline_para_size, - const int64_t max_seq_len, - const bool use_gptj_residual, - const vector weights): +LLaMA::LLaMA(const int64_t num_heads, + const int64_t size_per_head, + const int64_t inter_size, + const int64_t num_layers, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t random_seed, + const int64_t max_seq_len, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const vector weights): + vocab_size_(vocab_size), st_(weights[0].scalar_type()) { for (auto t : weights) { @@ -41,33 +39,29 @@ LLaMA::LLaMA(const int64_t head_num, switch (st_) { case at::ScalarType::Float: - ftllama = new FTLLaMA((size_t)head_num, + ftllama = new FTLLaMA((size_t)num_heads, (size_t)size_per_head, (size_t)inter_size, - (size_t)layer_num, + (size_t)num_layers, (size_t)vocab_size, (size_t)rotary_embedding_dim, - start_id, - end_id, + (size_t)random_seed, + (size_t)max_seq_len, tensor_para_size, pipeline_para_size, - (size_t)max_seq_len, - use_gptj_residual, weights); break; case at::ScalarType::Half: - ftllama = new FTLLaMA((size_t)head_num, + ftllama = new FTLLaMA((size_t)num_heads, (size_t)size_per_head, (size_t)inter_size, - (size_t)layer_num, + (size_t)num_layers, (size_t)vocab_size, (size_t)rotary_embedding_dim, - start_id, - end_id, + (size_t)random_seed, + (size_t)max_seq_len, tensor_para_size, pipeline_para_size, - (size_t)max_seq_len, - use_gptj_residual, weights); break; default: @@ -80,18 +74,8 @@ LLaMA::~LLaMA() delete ftllama; } -std::vector LLaMA::forward(th::Tensor input_ids, - th::Tensor input_lengths, - const int64_t output_len, - th::optional beam_width_opt, - th::optional top_k_opt, - th::optional top_p_opt, - th::optional beam_search_diversity_rate_opt, - th::optional temperature_opt, - th::optional len_penalty_opt, - th::optional repetition_penalty_opt, - th::optional random_seed_opt, - th::optional return_cum_log_probs_opt) +th::Tensor +LLaMA::forward(th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t start_pos) { CHECK_TH_CUDA(input_ids); CHECK_CONTIGUOUS(input_ids); @@ -99,45 +83,13 @@ std::vector LLaMA::forward(th::Tensor input_ids, CHECK_TH_CUDA(input_lengths); CHECK_CONTIGUOUS(input_lengths); TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); - int64_t return_cum_log_probs = return_cum_log_probs_opt.has_value() ? (int64_t)return_cum_log_probs_opt.value() : 0; - if (return_cum_log_probs_opt.has_value()) { - TORCH_CHECK(return_cum_log_probs == 0 || return_cum_log_probs == 1, - "return_cum_log_probs should be" - " 0 (no return cum_log_probs), " - " 1 (the cumulative log probs of generated sequences)") - } - const int beam_width = beam_width_opt.has_value() ? 
(int)beam_width_opt.value() : 1; - - const int batch_size = input_ids.size(0); - const int max_input_length = input_ids.size(1); - const int total_request_output_len = max_input_length + output_len; - th::Tensor output_ids = torch::empty({batch_size, beam_width, total_request_output_len}, - torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); - th::Tensor sequence_lengths = - torch::empty({batch_size, beam_width}, torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); - th::Tensor cum_log_probs = - torch::empty({batch_size, beam_width}, torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false)); - - ftllama->forward(input_ids, - input_lengths, - output_ids, - sequence_lengths, - cum_log_probs, - (const size_t)output_len, - (const size_t)beam_width, - top_k_opt, - top_p_opt, - beam_search_diversity_rate_opt, - temperature_opt, - len_penalty_opt, - repetition_penalty_opt, - random_seed_opt, - return_cum_log_probs_opt); - if (return_cum_log_probs > 0) { - return std::vector{output_ids, sequence_lengths, cum_log_probs}; - } - return std::vector{output_ids, sequence_lengths}; + const int batch_size = input_ids.size(0); + const int seq_len = input_ids.size(1); + th::Tensor output_logits = torch::empty({batch_size, seq_len, (long)vocab_size_}, + torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false)); + ftllama->forward(output_logits, input_ids, input_lengths, start_pos); + return output_logits; } } // namespace torch_ext @@ -158,7 +110,5 @@ static auto fasterTransformerGptTHS = int64_t, int64_t, int64_t, - int64_t, - bool, std::vector>()) .def("forward", &torch_ext::LLaMA::forward); diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 9a5efa3d0..0d97dc322 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -29,51 +29,35 @@ using std::vector; class IFLLaMA { public: virtual ~IFLLaMA() {} - virtual void forward(th::Tensor& input_ids, - th::Tensor& input_lengths, - th::Tensor& output_ids, - th::Tensor& sequence_lengths, - th::Tensor& cum_log_probs, - const size_t request_output_len, - const size_t beam_width, - th::optional top_k_opt, - th::optional top_p_opt, - th::optional beam_search_diversity_rate_opt, - th::optional temperature_opt, - th::optional len_penalty_opt, - th::optional repetition_penalty_opt, - th::optional random_seed_opt, - th::optional return_cum_log_probs_opt) = 0; + virtual void + forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t start_pos) = 0; }; template class FTLLaMA: public IFLLaMA { public: - FTLLaMA(const size_t head_num, + FTLLaMA(const size_t num_heads, const size_t size_per_head, const size_t inter_size, - const size_t layer_num, + const size_t num_layers, const size_t vocab_size, const size_t rotary_embedding_dim, - const int start_id, - const int end_id, + const size_t random_seed, + const size_t max_seq_len, const int64_t tensor_para_size, const int64_t pipeline_para_size, - const size_t max_seq_len, - const bool use_gptj_residual, const vector weights): - head_num_(head_num), + num_heads_(num_heads), size_per_head_(size_per_head), inter_size_(inter_size), - layer_num_(layer_num), + num_layers_(num_layers), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), - start_id_(start_id), - end_id_(end_id), - use_gptj_residual_(use_gptj_residual), - weights_(weights), + random_seed_(random_seed), + max_seq_len_(max_seq_len), 
tensor_para_size_(tensor_para_size), - pipeline_para_size_(pipeline_para_size) + pipeline_para_size_(pipeline_para_size), + weights_(weights) { ft::check_cuda_error(cublasLtCreate(&cublasltHandle_)); cublas_algo_map_ = new ft::cublasAlgoMap(GEMM_CONFIG, ""); @@ -81,40 +65,42 @@ class FTLLaMA: public IFLLaMA { ftNcclInitialize(tensor_para_, pipeline_para_, tensor_para_size, pipeline_para_size); - llama_weights_.resizeLayer(layer_num_); - for (int i = 0; i < (int)layer_num_; i++) { + llama_weights_.resizeLayer(num_layers_); + for (int i = 0; i < (int)num_layers_; i++) { llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta = - get_ptr(weights_[i + 0 * layer_num_]); + get_ptr(weights_[i + 0 * num_layers_]); llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.gamma = - get_ptr(weights_[i + 1 * layer_num_]); + get_ptr(weights_[i + 1 * num_layers_]); llama_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.kernel = - get_ptr(weights_[i + 2 * layer_num_]); + get_ptr(weights_[i + 2 * num_layers_]); llama_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.bias = - get_ptr(weights_[i + 3 * layer_num_]); + get_ptr(weights_[i + 3 * num_layers_]); llama_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.kernel = - get_ptr(weights_[i + 4 * layer_num_]); + get_ptr(weights_[i + 4 * num_layers_]); llama_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.bias = - get_ptr(weights_[i + 5 * layer_num_]); + get_ptr(weights_[i + 5 * num_layers_]); llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.kernel = - get_ptr(weights_[i + 6 * layer_num_]); + get_ptr(weights_[i + 6 * num_layers_]); llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.bias = - get_ptr(weights_[i + 7 * layer_num_]); + get_ptr(weights_[i + 7 * num_layers_]); llama_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.kernel = - get_ptr(weights_[i + 8 * layer_num_]); + get_ptr(weights_[i + 8 * num_layers_]); llama_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.bias = - get_ptr(weights_[i + 9 * layer_num_]); + get_ptr(weights_[i + 9 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight2.kernel = + get_ptr(weights_[i + 10 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight2.bias = + get_ptr(weights_[i + 11 * num_layers_]); llama_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.beta = - get_ptr(weights_[i + 10 * layer_num_]); + get_ptr(weights_[i + 12 * num_layers_]); llama_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.gamma = - get_ptr(weights_[i + 11 * layer_num_]); + get_ptr(weights_[i + 13 * num_layers_]); } - llama_weights_.pre_decoder_embedding_table = get_ptr(weights_[12 * layer_num_ + 0]); - llama_weights_.post_decoder_layernorm.gamma = get_ptr(weights_[12 * layer_num_ + 1]); - llama_weights_.post_decoder_layernorm.beta = get_ptr(weights_[12 * layer_num_ + 2]); - llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[12 * layer_num_ + 3]); - - //llama_weights_.setMaxSeqLen(max_seq_len); + llama_weights_.pre_decoder_embedding_table = get_ptr(weights_[14 * num_layers_ + 0]); + llama_weights_.post_decoder_layernorm.beta = get_ptr(weights_[14 * num_layers_ + 1]); + llama_weights_.post_decoder_layernorm.gamma = get_ptr(weights_[14 * num_layers_ + 2]); + llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * 
num_layers_ + 3]); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); } @@ -128,24 +114,11 @@ class FTLLaMA: public IFLLaMA { delete cublas_wrapper_mutex_; } - void forward(th::Tensor& input_ids, - th::Tensor& input_lengths, - th::Tensor& output_ids, - th::Tensor& sequence_lengths, - th::Tensor& cum_log_probs, - const size_t request_output_len, - const size_t beam_width, - th::optional top_k_opt, - th::optional top_p_opt, - th::optional beam_search_diversity_rate_opt, - th::optional temperature_opt, - th::optional len_penalty_opt, - th::optional repetition_penalty_opt, - th::optional random_seed_opt, - th::optional return_cum_log_probs_opt) override + virtual void forward(th::Tensor& output_logits, + th::Tensor& input_ids, + th::Tensor& input_lengths, + const int64_t start_pos) override { - int return_cum_log_probs = return_cum_log_probs_opt.has_value() ? (int)return_cum_log_probs_opt.value() : 0; - auto stream = at::cuda::getCurrentCUDAStream().stream(); cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); cublasSetStream(cublasHandle, stream); @@ -161,8 +134,7 @@ class FTLLaMA: public IFLLaMA { } const size_t request_batch_size = (size_t)input_ids.size(0); - const size_t max_input_length = (size_t)input_ids.size(1); - const int total_output_len = (int)(max_input_length + request_output_len); + const size_t seq_len = (size_t)input_ids.size(1); ft::AttentionType attention_type = ft::getAttentionType(size_per_head_, ft::getSMVersion(), @@ -172,85 +144,41 @@ class FTLLaMA: public IFLLaMA { false, // with_relative_position_bias true); // causal_mask - ft::LLaMA llama = ft::LLaMA(head_num_, + ft::LLaMA llama = ft::LLaMA(num_heads_, size_per_head_, inter_size_, - layer_num_, + num_layers_, vocab_size_, rotary_embedding_dim_, - 0, // random_seed, + random_seed_, + max_seq_len_, + tensor_para_, + pipeline_para_, stream, &cublas_wrapper, &allocator, false, // is_free_buffer_after_forward &prop_, // cuda_device_prop attention_type // attention_type - ); - std::vector output_seq_len(request_batch_size, total_output_len); + ); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, - std::vector{request_batch_size, max_input_length}, + std::vector{request_batch_size, seq_len}, get_ptr(input_ids)}}, {"input_lengths", ft::Tensor{ ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{request_batch_size}, get_ptr(input_lengths)}}, - {"output_seq_len", - ft::Tensor{ - ft::MEMORY_CPU, ft::TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}}; - if (beam_width > 1 && beam_search_diversity_rate_opt.has_value()) { - input_tensors.insert( - {"beam_search_diversity_rate", - convert_tensor(beam_search_diversity_rate_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } - if (top_p_opt.has_value()) { - input_tensors.insert( - {"runtime_top_p", convert_tensor(top_p_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } - if (top_k_opt.has_value()) { - input_tensors.insert( - {"runtime_top_k", convert_tensor(top_k_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } - if (temperature_opt.has_value()) { - input_tensors.insert( - {"temperature", convert_tensor(temperature_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } - if (len_penalty_opt.has_value()) { - input_tensors.insert( - {"len_penalty", convert_tensor(len_penalty_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } - if (repetition_penalty_opt.has_value()) { - input_tensors.insert({"repetition_penalty", - convert_tensor(repetition_penalty_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } - if 
(random_seed_opt.has_value()) { - input_tensors.insert( - {"random_seed", - convert_tensor(random_seed_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } + {"start_pos", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_UINT32, std::vector{1}, &start_pos}}}; std::unordered_map output_tensors = std::unordered_map{ - {"output_ids", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_INT32, - std::vector{request_batch_size, beam_width, (size_t)total_output_len}, - get_ptr(output_ids)}}, - {"sequence_length", + {"output_logits", ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_INT32, - std::vector{request_batch_size, beam_width}, - get_ptr(sequence_lengths)}}}; - - if (return_cum_log_probs > 0) { - output_tensors.insert({"cum_log_probs", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width}, - get_ptr(cum_log_probs)}}); - } - + ft::TYPE_FP32, + std::vector{request_batch_size, seq_len, vocab_size_}, + get_ptr(output_logits)}}}; try { llama.forward(&output_tensors, &input_tensors, &llama_weights_); } @@ -265,17 +193,16 @@ class FTLLaMA: public IFLLaMA { } private: - const size_t head_num_; + const size_t num_heads_; const size_t size_per_head_; const size_t inter_size_; - const size_t layer_num_; + const size_t num_layers_; const size_t vocab_size_; const size_t rotary_embedding_dim_; - const int start_id_; - const int end_id_; - const bool use_gptj_residual_; - - // const ft::gptVariantParams gpt_variant_params_; + const size_t random_seed_; + const size_t max_seq_len_; + int64_t tensor_para_size_; + int64_t pipeline_para_size_; std::vector weights_; cublasLtHandle_t cublasltHandle_; @@ -286,44 +213,29 @@ class FTLLaMA: public IFLLaMA { ft::NcclParam tensor_para_; ft::NcclParam pipeline_para_; - - int64_t tensor_para_size_; - int64_t pipeline_para_size_; }; class LLaMA: public th::jit::CustomClassHolder { public: - LLaMA(const int64_t head_num, + LLaMA(const int64_t num_heads, const int64_t size_per_head, const int64_t inter_size, - const int64_t layer_num, + const int64_t num_layers, const int64_t vocab_size, const int64_t rotary_embedding_dim, - const int64_t start_id, - const int64_t end_id, + const int64_t random_seed, + const int64_t max_seq_len, const int64_t tensor_para_size, const int64_t pipeline_para_size, - const int64_t max_seq_len, - const bool use_gptj_residual, const vector weights); ~LLaMA(); - vector forward(th::Tensor input_ids, - th::Tensor input_lengths, - const int64_t output_len, - th::optional beam_width_opt, - th::optional top_k_opt, - th::optional top_p_opt, - th::optional beam_search_diversity_rate_opt, - th::optional temperature_opt, - th::optional len_penalty_opt, - th::optional repetition_penalty_opt, - th::optional random_seed_opt, - th::optional return_cum_log_probs_opt); + th::Tensor forward(th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t start_pos); private: const at::ScalarType st_; + size_t vocab_size_; IFLLaMA* ftllama; std::vector weights; }; diff --git a/src/fastertransformer/utils/memory_utils.cu b/src/fastertransformer/utils/memory_utils.cu index 134224a09..d795cbf99 100644 --- a/src/fastertransformer/utils/memory_utils.cu +++ b/src/fastertransformer/utils/memory_utils.cu @@ -177,7 +177,7 @@ __global__ void cudaCast(T_OUT* dst, T_IN* src, const size_t size) template void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream) { - cudaCast<<<256, 256, 0, stream>>>(dst, src, size); + cudaCast<<<(size + 255) / 256, 256, 0, stream>>>(dst, src, size); } template void invokeCudaCast(float* dst, half const* const 
src, const size_t size, cudaStream_t stream); From d403986a3691d4f2ab9046727126a114436dbb21 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 23 Sep 2023 21:45:26 +0000 Subject: [PATCH 26/55] debug code --- src/fastertransformer/kernels/gpt_kernels.cu | 435 +++++++++++++++--- src/fastertransformer/kernels/gpt_kernels.h | 9 + .../LLaMAContextAttentionLayer.cc | 111 +++-- src/fastertransformer/models/llama/LLaMA.cc | 129 +++++- .../models/llama/LLaMAContextDecoder.cc | 114 ++++- .../models/llama/LLaMAContextDecoder.h | 1 - src/fastertransformer/th_op/llama/LLaMA.cc | 2 +- src/fastertransformer/th_op/llama/LLaMA.h | 6 +- 8 files changed, 683 insertions(+), 124 deletions(-) diff --git a/src/fastertransformer/kernels/gpt_kernels.cu b/src/fastertransformer/kernels/gpt_kernels.cu index 7dc9af620..76852dc4d 100644 --- a/src/fastertransformer/kernels/gpt_kernels.cu +++ b/src/fastertransformer/kernels/gpt_kernels.cu @@ -114,7 +114,70 @@ __global__ void start_id_embedding_position_lookups_kernel(T* length, \ max_length, \ batch_size, \ - hidden_units); + hidden_units) +template +__global__ void start_id_embedding_lookups_kernel(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int64_t hidden_units) +{ + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * length * hidden_units; + index += blockDim.x * gridDim.x) { + + // embedding lookup from word ids [batch, length] (part of [batch, length]) and [vocab, hidden] to generate + // embedding [batch, length, hidden] + const int word_index = index / hidden_units; + const int word_index_row = word_index / length; // batch_id + const int word_index_col = word_index % length; + const int real_word_index = word_index_row * length + word_index_col; + const int col_index = index % hidden_units; + const int input_id = input_ids == nullptr ? real_word_index : input_ids[real_word_index]; + + from_tensor[index] = embedding_table[input_id * hidden_units + col_index]; + } +} + +template +void invokeInputIdsEmbeddingLookup(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream) +{ + dim3 grid(min(batch_size * length, 65536)); + dim3 block(min(hidden_units, 512)); + start_id_embedding_lookups_kernel + <<>>(from_tensor, embedding_table, input_ids, length, batch_size, hidden_units); +} + +template void invokeInputIdsEmbeddingLookup(float* from_tensor, + const float* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); +template void invokeInputIdsEmbeddingLookup(half* from_tensor, + const half* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); + +#ifdef ENABLE_BF16 +template void invokeInputIdsEmbeddingLookup(__nv_bfloat16* from_tensor, + const __nv_bfloat16* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); +#endif template void invokeInputIdsEmbeddingLookupPosEncoding(T* from_tensor, @@ -203,27 +266,89 @@ template void invokeInputIdsEmbeddingLookupPosEncoding(__nv_bfloat16* template __global__ void inputIdsEmbeddingLookupPosEncodingSoftPrompt(inputIdsEmbeddingLookupPosEncodingSoftPromptParam param) { - // 1. Copy the input ids to output ids and transpose output ids to [seq_len, batch_size, beam_width]. - // 2. 
Embedding lookup by input ids and concat with soft prompt. The axis of concatenation is on axis of seq_len. - - // Assume batch size is 2 and prompts are [[t1, t2], [t3], [t4, t5]], input_ids are [[s1, s2], [s3], [s4]] - // then the order of output_ids is - // [ [?, ?, s1, s2] - // [?, s3, padding, padding] - // [?, ?, s4, padding] ] - // and the order of embedding is - // [ [t1, t2, s1, s2] - // [t3, s3, padding, padding] - // [t4, t5, s4, padding] ] - // where "?" means undefined values and we should attach it. + // 1. Copy the + // input ids to + // output ids + // and + // transpose + // output ids + // to [seq_len, + // batch_size, + // beam_width]. + // 2. Embedding + // lookup by + // input ids + // and concat + // with soft + // prompt. The + // axis of + // concatenation + // is on axis + // of seq_len. + + // Assume batch + // size is 2 + // and prompts + // are [[t1, + // t2], [t3], + // [t4, t5]], + // input_ids + // are [[s1, + // s2], [s3], + // [s4]] then + // the order of + // output_ids + // is [ [?, ?, + // s1, s2] + // [?, s3, + // padding, + // padding] + // [?, ?, s4, + // padding] ] + // and the + // order of + // embedding is + // [ [t1, t2, + // s1, s2] + // [t3, s3, + // padding, + // padding] + // [t4, t5, + // s4, + // padding] ] + // where "?" + // means + // undefined + // values and + // we should + // attach it. for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < param.batch_size * param.beam_width * (param.max_prefix_soft_prompt_length + param.max_input_length) * param.hidden_units; index += blockDim.x * gridDim.x) { - // transpose the input_ids [batch, length] (part of [batch, beam, max_input_length]) to - // output_ids [length, batch, beam]. - // ouptut_ids need to add padding in the beginning for soft prompting. + // transpose + // the + // input_ids + // [batch, + // length] + // (part of + // [batch, + // beam, + // max_input_length]) + // to + // output_ids + // [length, + // batch, + // beam]. + // ouptut_ids + // need to + // add + // padding + // in the + // beginning + // for soft + // prompting. 
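To make the selection spelled out in the comments above concrete, a minimal CPU reference for the embedding side of the soft-prompt concatenation follows; it reproduces the [t1, t2, s1, s2] ordering from the example. This is illustrative only: the function name is invented, and the beam dimension, the `output_ids` transpose, and the padding fill are left out.

```
#include <cstddef>
#include <vector>

// CPU sketch of the embedding concatenation: positions before prompt_len[b]
// read the soft-prompt vectors, the remaining positions read the word
// embedding of the corresponding input id (beam width and padding omitted).
void soft_prompt_embedding_ref(std::vector<float>&       out,          // [batch, max_prompt_len + input_len, hidden]
                               const std::vector<float>& soft_prompt,  // [batch, max_prompt_len, hidden]
                               const std::vector<float>& table,        // [vocab, hidden]
                               const std::vector<int>&   ids,          // [batch, input_len]
                               const std::vector<int>&   prompt_len,   // [batch]
                               int batch, int max_prompt_len, int input_len, int hidden)
{
    const int total_len = max_prompt_len + input_len;
    for (int b = 0; b < batch; ++b) {
        for (int s = 0; s < prompt_len[b] + input_len; ++s) {
            for (int h = 0; h < hidden; ++h) {
                const float v = (s < prompt_len[b]) ?
                    soft_prompt[((size_t)b * max_prompt_len + s) * hidden + h] :
                    table[(size_t)ids[(size_t)b * input_len + (s - prompt_len[b])] * hidden + h];
                out[((size_t)b * total_len + s) * hidden + h] = v;
            }
        }
    }
}
```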
if (index < param.batch_size * param.beam_width * param.max_input_length) { int tmp_index = index; @@ -239,21 +364,43 @@ __global__ void inputIdsEmbeddingLookupPosEncodingSoftPrompt(inputIdsEmbeddingLo } } - // embedding lookup from word ids [batch, beam, length] (part of [batch, beam, max_input_length]), [vocab, - // hidden] and [batch, max_prefix_soft_prompt_length, hidden] to generate embedding [batch, beam, length + - // max_prefix_soft_prompt_length, hidden] - int tmp_index = index; - const int hidden_id = tmp_index % param.hidden_units; - tmp_index = (tmp_index - hidden_id) / param.hidden_units; - const int seq_id = tmp_index % (param.max_prefix_soft_prompt_length + param.max_input_length); - tmp_index = (tmp_index - seq_id) / (param.max_prefix_soft_prompt_length + param.max_input_length); - const int beam_id = tmp_index % param.beam_width; - tmp_index = (tmp_index - beam_id) / param.beam_width; - const int batch_id = tmp_index % param.batch_size; + // embedding + // lookup + // from + // word ids + // [batch, + // beam, + // length] + // (part of + // [batch, + // beam, + // max_input_length]), + // [vocab, + // hidden] + // and + // [batch, + // max_prefix_soft_prompt_length, + // hidden] + // to + // generate + // embedding + // [batch, + // beam, + // length + + // max_prefix_soft_prompt_length, + // hidden] + int tmp_index = index; + const int hidden_id = tmp_index % param.hidden_units; + tmp_index = (tmp_index - hidden_id) / param.hidden_units; + const int seq_id = tmp_index % (param.max_prefix_soft_prompt_length + param.max_input_length); + tmp_index = (tmp_index - seq_id) / (param.max_prefix_soft_prompt_length + param.max_input_length); + const int beam_id = tmp_index % param.beam_width; + tmp_index = (tmp_index - beam_id) / param.beam_width; + const int batch_id = tmp_index % param.batch_size; const int64_t hidden_units = param.hidden_units; - T embedding = + T embedding = (seq_id < param.prefix_soft_prompt_lengths[batch_id]) ? 
- (T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units + (T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units + seq_id * hidden_units + hidden_id] : param.embedding_table[param.input_ids[batch_id * param.beam_width * param.max_input_length + beam_id * param.max_input_length @@ -292,7 +439,8 @@ template void invokeInputIdsEmbeddingLookupPosEncodingSoftPrompt( inputIdsEmbeddingLookupPosEncodingSoftPromptParam<__nv_bfloat16> param); #endif -// TODO Add half2 implementation +// TODO Add half2 +// implementation template __global__ void transposeAxis01(T* out, T* in, const int dim0, const int dim1, const int dim2) { @@ -329,9 +477,11 @@ invokeTransposeAxis01(int* out, int* in, const int dim0, const int dim1, const i template __global__ void transposeAxis01(T* out, T* in, const int* in_skipping_dim1, const int dim0, const int dim1) { - // out: [dim1, dim0] - // in: [dim0, dim1] - // in_skipping_dim1: [dim1] + // out: [dim1, + // dim0] in: + // [dim0, dim1] + // in_skipping_dim1: + // [dim1] int index = threadIdx.x + blockIdx.x * blockDim.x; if (index < dim0 * dim1) { @@ -363,8 +513,15 @@ __global__ void buildDecoderAttentionMaskKernel(T* attention_mask, const int max_seq_len, const int max_prompt_length) { - // sequence_lengths: [batch_size] - // attention_mask: [batch_size, 1, max_seq_len, max_seq_len + max_prompt_length] + // sequence_lengths: + // [batch_size] + // attention_mask: + // [batch_size, + // 1, + // max_seq_len, + // max_seq_len + // + + // max_prompt_length] const int max_prompt_seq_length = max_seq_len + max_prompt_length; const int mask_size_per_seq = max_seq_len * max_prompt_seq_length; attention_mask += blockIdx.x * mask_size_per_seq; @@ -581,29 +738,100 @@ template __global__ void find_context_dups(int* shared_contexts, const int* input_ids, const size_t batch_size, const size_t input_seq_len) { - /* We compare all context pairs (i, j), with i (tgt) < j (src) , to detect duplicate - * inputs. If there's a match between i and j, we store i at the - * j-th position of shared_context. So that we know that j can be - * represented by i. shared_contexts is initialized like shared_contexts[i] = i - * and when there's a match, we actually use shared_contexts[j] = min(shared_contexts[j], i) - * so that in the end, shared_contexts effectively contains an index - * to the match with the lowest index context. - * Note that shared_contexts[i] <= i, a property that will be used when uncompacting + /* We compare + * all context + * pairs (i, + * j), with i + * (tgt) < j + * (src) , to + * detect + * duplicate + * inputs. If + * there's a + * match + * between i + * and j, we + * store i at + * the j-th + * position of + * shared_context. + * So that we + * know that j + * can be + * represented + * by i. + * shared_contexts + * is + * initialized + * like + * shared_contexts[i] + * = i and when + * there's a + * match, we + * actually use + * shared_contexts[j] + * = + * min(shared_contexts[j], + * i) so that + * in the end, + * shared_contexts + * effectively + * contains an + * index to the + * match with + * the lowest + * index + * context. + * Note that + * shared_contexts[i] + * <= i, a + * property + * that will be + * used when + * uncompacting * inputs. */ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; __shared__ bool match; - /* Each block is responsible for a (i, j) pair. 
To map the block space to - * the i < j space, we need to convert a linear addressing to a triangle, of - * size (batch_size * (batch_size - 1)) / 2 - * For more information, check https://en.wikipedia.org/wiki/Triangular_number + /* Each block + * is + * responsible + * for a (i, j) + * pair. To map + * the block + * space to the + * i < j space, + * we need to + * convert a + * linear + * addressing + * to a + * triangle, of + * size + * (batch_size + * * (batch_size - 1)) / 2 + * For more + * information, + * check + * https://en.wikipedia.org/wiki/Triangular_number */ - // blockIdx = [0, 1, 2, ... n(n-1)/2] -> base_index = [0, 1, 1, 2, 2, 2, 3, 3, 3, 3, ..., n - 2] + // blockIdx = + // [0, 1, 2, + // ... + // n(n-1)/2] -> + // base_index = + // [0, 1, 1, 2, + // 2, 2, 3, 3, + // 3, 3, ..., n + // - 2] const int base_index = floorf(0.5f * (sqrtf(1 + 8 * blockIdx.x) - 1)); - const int src_idx = base_index + 1; // base_index \in [1, batch_size) + const int src_idx = base_index + 1; // base_index + // \in + // [1, + // batch_size) const int rev_base_index = base_index * (base_index + 1) / 2; const int tgt_idx = blockIdx.x - rev_base_index; // tgt_idx \in [0, src_idx) @@ -659,9 +887,19 @@ __global__ void generate_dups_indices(int* batch_to_compact, if (!masked && is_first_occur) { int compact_idx = scan + (first_iter ? 0 : scan_offset); - // Context rep. writes initial index + // Context + // rep. + // writes + // initial + // index batch_to_compact[seq_idx * beam_width] = compact_idx; - // input ids are tiled in context part + // input + // ids + // are + // tiled + // in + // context + // part compact_to_batch[compact_idx] = seq_idx * beam_width; } @@ -674,13 +912,27 @@ __global__ void generate_dups_indices(int* batch_to_compact, __syncthreads(); if (!masked && !is_first_occur) { - // Fill the rest of batch_to_compact based on what rep. wrote + // Fill + // the + // rest + // of + // batch_to_compact + // based + // on + // what + // rep. 
+ // wrote const int src_idx = batch_to_compact[shared_contexts[seq_idx] * beam_width]; batch_to_compact[seq_idx * beam_width] = src_idx; } if (!masked) { - // set same compact idx for beams + // set + // same + // compact + // idx + // for + // beams for (int beam_id = 1; beam_id < beam_width; ++beam_id) { batch_to_compact[seq_idx * beam_width + beam_id] = batch_to_compact[seq_idx * beam_width]; } @@ -713,11 +965,20 @@ void invokeFindContextDups(int* shared_contexts, { dim3 block{512}; dim3 grid{((int)batch_size + block.x - 1) / block.x}; - // set shared_context[i] = i + // set + // shared_context[i] = + // i init_shared_contexts<<>>(shared_contexts, batch_size); grid = dim3{(unsigned int)(batch_size * (batch_size - 1)) / 2}; - // set shared_contexts[i] = j, where j = min{k, such that input_ids[k] == input_ids[i]} + // set + // shared_contexts[i] + // = j, where j + // = min{k, + // such that + // input_ids[k] + // == + // input_ids[i]} if (input_seq_len <= 128) { block = 128; find_context_dups<128><<>>(shared_contexts, input_ids, batch_size, input_seq_len); @@ -727,8 +988,21 @@ void invokeFindContextDups(int* shared_contexts, find_context_dups<256><<>>(shared_contexts, input_ids, batch_size, input_seq_len); } - // set batch_to_compact[i] = j, where j is the position of input_ids[i] in the compact_batch - // set compact_to_batch[i] = j, where j is such that compact_to_batch[i] = input_ids[j] + // set + // batch_to_compact[i] + // = j, where j + // is the + // position of + // input_ids[i] + // in the + // compact_batch + // set + // compact_to_batch[i] + // = j, where j + // is such that + // compact_to_batch[i] + // = + // input_ids[j] generate_dups_indices<<<1, DUPS_INDICES_BLOCK_SIZE, 0, stream>>>( batch_to_compact, compact_to_batch, compact_size, shared_contexts, batch_size, beam_width, input_seq_len); } @@ -782,10 +1056,29 @@ void invokeCompactInputs(T* compact_input, size_t hidden_dimension, cudaStream_t stream) { - /* Compact relevant decoder_layer inputs based on the identical contexts. - * For example, decoder_input is [batch_size, seq_len, H]. It's compacted - * into compact_input [compact_size, seq_len, H] such that - * compact_input[i, ...] = decoder_input[compact_idx[i], ...] */ + /* Compact + * relevant + * decoder_layer + * inputs based + * on the + * identical + * contexts. + * For example, + * decoder_input + * is + * [batch_size, + * seq_len, H]. + * It's + * compacted + * into + * compact_input + * [compact_size, + * seq_len, H] + * such that + * compact_input[i, + * ...] = + * decoder_input[compact_idx[i], + * ...] 
     const size_t elems_n = compact_size * seq_len * max(hidden_dimension, seq_len);
     const dim3   blockDim(512);
     const dim3   gridDim((elems_n + 512 - 1) / 512);
@@ -828,8 +1121,19 @@ __global__ void uncompact_outputs(T* uncompact_buffer,
                                   size_t batch_size,
                                   size_t buffer_stride)
 {
-    /* Uncompact a buffer IN of size [Compact, Stride] into OUT of size [Batch, Stride]
-     * so that \forall i, OUT[i, :] = IN[batch_to_compact_idx[i], :]
+    /* Uncompact a buffer IN of size [Compact, Stride] into OUT of size [Batch, Stride]
+     * so that \forall i, OUT[i, :] = IN[batch_to_compact_idx[i], :]
      */
     const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -1124,4 +1428,5 @@ INSTANTIATE_INVOKE_SUM_LENGTH_DIMENSION(__nv_bfloat16);
 #endif
 #undef INSTANTIATE_INVOKE_SUM_LENGTH_DIMENSION
-}  // namespace fastertransformer
+}  // namespace fastertransformer
diff --git a/src/fastertransformer/kernels/gpt_kernels.h b/src/fastertransformer/kernels/gpt_kernels.h
index d78224e0a..bf4963231 100644
--- a/src/fastertransformer/kernels/gpt_kernels.h
+++ b/src/fastertransformer/kernels/gpt_kernels.h
@@ -59,6 +59,15 @@ struct pPromptTuningParam {
     const T* request_prompt_embedding = nullptr;
 };
+template<typename T>
+void invokeInputIdsEmbeddingLookup(T*           from_tensor,
+                                   const T*     embedding_table,
+                                   const int*   input_ids,
+                                   const int    length,
+                                   const int    batch_size,
+                                   const int    hidden_units,
+                                   cudaStream_t stream);
+
 template<typename T>
 void invokeInputIdsEmbeddingLookupPosEncoding(T*   from_tensor,
                                               int* output_ids,
diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc
index f0dfce8c7..a1cb9b81f 100644
--- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc
+++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc
@@ -48,7 +48,7 @@ void LLaMAContextAttentionLayer<T>::forward(TensorMap* output_ten
     const int  layer_id       = input_tensors->getVal<int>("layer_id");
     const int* padding_offset = input_tensors->getPtr<int>("padding_offset", nullptr);
     int*       cu_seqlens     = input_tensors->getPtr<int>("cu_seqlens", nullptr);
-    size_t start_pos = input_tensors->at("start_pos").max();
+    int start_pos = input_tensors->at("start_pos").max();
     T* attention_out   = output_tensors->at("hidden_features").getPtr<T>();
     T* attention_input = input_tensors->at("input_query").getPtr<T>();
@@ -79,13 +79,54 @@ void LLaMAContextAttentionLayer<T>::forward(TensorMap* output_ten
                               qkv_buf_,
                               3 * hidden_units_ /* n */);
     sync_check_cuda_error();
-    // IDEA: append prefix prompt key value here
-    PrefixPromptBatchWeightsParam<T> param{nullptr, nullptr, 0, (size_t)layer_id * 2 * head_num_ * size_per_head_};
+    /*
+    if (layer_id < 15) {
+        T* out = (T*)malloc(sizeof(T) * m * 3 * hidden_units_);
+        T *tmp = out;
+        cudaMemcpy(
+            out, qkv_buf_, sizeof(T) * m * 3 * hidden_units_, cudaMemcpyDeviceToHost);
+        for (int i = 0; i < 3; ++i) {
+            for (int b = 0; b < batch_size; ++b) {
+                std::cout << "[\n";
+                for (int s = 0; s < 3; ++s) {
+                    std::cout << "[ ";
+                    for (int h = 0; h < 3; ++h) {
+                        std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " ";
+                    }
+                    std::cout << " ... 
"; + for (int h = hidden_units_-3; h < hidden_units_; ++h) { + std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; + } + std::cout << "]\n"; + } + std::cout << "...\n"; + for (int s = seq_len-3; s < seq_len; ++s) { + std::cout << "[ "; + for (int h = 0; h < 3; ++h) { + std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; + } + std::cout << " ... "; + for (int h = hidden_units_-3; h < hidden_units_; ++h) { + std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; + } + std::cout << "]\n"; + } + std::cout << "]\n"; + } + std::cout << "\n"; + out += hidden_units_; + } + + free(tmp); + } + */ if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); + sync_check_cuda_error(); } + PrefixPromptBatchWeightsParam param; invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, v_buf_2_, @@ -105,30 +146,26 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten stream_); sync_check_cuda_error(); - const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length - // Use batch major - // put k/v_buf from shape [B, H, L, Dh] - // to cache [B, H, Dh/x, L, x] and [B, H, L, Dh/x, x] - // TODO: Cache implementation - // k_cache: [batch_size, num_heads, L, Dh] - // k_buf: [batch_size, num_heads, start_pos + seq_len, Dh] - // v_buf: [batch_size, num_heads, L, Dh] - - invokeTranspose4dBatchMajor(output_tensors->getPtr("key_cache"), - output_tensors->getPtr("value_cache"), - k_buf_2_, - v_buf_2_, - batch_size, - seq_len, - max_seq_len, - size_per_head_, - head_num_, - stream_); - // IDEA : after this, - // k_cache = (batch_size, num_heads, Dh/x, L, x) - // v_cache = (batch_size, num_heads, L, Dh) - sync_check_cuda_error(); - POP_RANGE; + // const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length + // // Use batch major + // // put k/v_buf from shape [B, H, L, Dh] + // // to cache [B, H, Dh/x, L, x] and [B, H, L, Dh/x, x] + // // TODO: Cache implementation + // // k_cache: [batch_size, num_heads, L, Dh] + // // k_buf: [batch_size, num_heads, start_pos + seq_len, Dh] + // // v_buf: [batch_size, num_heads, L, Dh] + // invokeTranspose4dBatchMajor(output_tensors->getPtr("key_cache"), + // output_tensors->getPtr("value_cache"), + // k_buf_2_, + // v_buf_2_, + // batch_size, + // seq_len, + // max_seq_len, + // size_per_head_, + // head_num_, + // stream_); + // sync_check_cuda_error(); + // POP_RANGE; if (attention_type == AttentionType::FUSED_MHA) { dispatcher_fp16->setup_causal_masked_fmha(seq_len, batch_size); @@ -136,8 +173,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } else { const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = seq_len; // q length - const int attention_seq_len_2 = start_pos + seq_len; // kv length + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = seq_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); // @@ -226,16 +263,21 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten size_per_head_, attention_seq_len_1, attention_seq_len_2, + v_buf_2_, size_per_head_, attention_seq_len_2 * size_per_head_, + qk_buf_, attention_seq_len_2, attention_seq_len_1 * attention_seq_len_2, + qkv_buf_2_, size_per_head_, attention_seq_len_1 * size_per_head_, + batch_size * 
head_num_); + sync_check_cuda_error(); // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) if (padding_offset == nullptr) { @@ -262,6 +304,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten attention_weights->attention_output_weight.scale, 0, // int8_mode stream_); + sync_check_cuda_error(); } POP_RANGE; } @@ -279,6 +322,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten hidden_units_, attention_out, hidden_units_); + sync_check_cuda_error(); POP_RANGE; if (is_free_buffer_after_forward_ == true) { @@ -289,8 +333,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( - size_t head_num, +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -307,8 +350,7 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( } template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( - size_t head_num, +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, size_t local_head_num, cudaStream_t stream, @@ -328,8 +370,7 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( } template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( - size_t head_num, +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, size_t local_head_num, size_t rotary_embedding_dim, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index e4c4e4ee8..d21302c61 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -23,8 +23,104 @@ #include #include +#include + namespace fastertransformer { +template +static void _print_tensor1(T* out, int dim1, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + + std::cout << "["; + for (int i = start0; i < end0; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != dim1 - 1) + std::cout << ", "; + } + if (end0 != start1) { + std::cout << "..., "; + } + for (int i = start1; i < end1; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != end1 - 1) + std::cout << ", "; + } + std::cout << "]"; +} + +template +static void _print_tensor2(T* out, int dim1, int dim2, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor1(&out[i * dim2], dim2, indent + 1); + if (i != dim1 - 1) + std::cout << ",\n"; + } + if (end0 != start1) { + std::cout << ind; + std::cout << "...,\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor1(&out[i * dim2], dim2, indent + 1); + if (i != end1 - 1) + std::cout << ",\n"; + } + std::cout << "]"; +} + +template +static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int indent) +{ + std::string ind(indent, ' '); + + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 
0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); + if (i != dim1 - 1) + std::cout << ",\n\n"; + } + if (start1 != end1) { + std::cout << ind; + std::cout << "...,\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); + if (i != end1 - 1) + std::cout << ",\n"; + } + std::cout << "]\n"; +} + +template +static void print_tensor3(T* in, int dim1, int dim2, int dim3) +{ + T* out = (T*)malloc(sizeof(T) * dim1 * dim2 * dim3); + cudaMemcpy(out, in, sizeof(T) * dim1 * dim2 * dim3, cudaMemcpyDeviceToHost); + _print_tensor3(out, dim1, dim2, dim3, 1); + free(out); +} + template void LLaMA::initialize() { @@ -56,7 +152,7 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batch_size * max_seq_len * hidden_units_; input_attention_mask_ = - (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * max_seq_len, false)); + (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); decoder_output_buf_ = (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); normed_decoder_output_buf_ = @@ -204,6 +300,7 @@ void LLaMA::forward(std::unordered_map* output_ten const std::unordered_map* input_tensors, const LLaMAWeight* llama_weights) { + // Logger::getLogger().setLevel(Logger::Level::TRACE); // input_tensors: // input_ids [batch_size, seq_len] // input_lengths [batch_size] @@ -224,7 +321,7 @@ void LLaMA::forward(std::unordered_map* output_ten int seq_len = input_tensors->at("input_ids").shape[1]; // max cache seq len should include max prefix prompt length as it has k/v states - const size_t start_pos = input_tensors->at("start_pos").max(); + const int start_pos = input_tensors->at("start_pos").max(); const cudaDataType_t gemm_data_type = getCudaDataType(); allocateBuffer(batch_size, seq_len, max_seq_len_); @@ -243,7 +340,7 @@ void LLaMA::forward(std::unordered_map* output_ten invokeTileGptInputs(tiled_input_ids_buf_, tiled_input_lengths_buf_, input_tensors->at("input_ids").getPtr(), - input_tensors->at("input_lengths").getPtr(), + input_tensors->at("input_lengths").getPtr(), batch_size, 1, seq_len, @@ -255,19 +352,18 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); if (pipeline_para_.rank_ == 0) { - invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, - nullptr, - llama_weights->pre_decoder_embedding_table, - llama_weights->position_encoding_table, - pPromptTuningParam{}, // no p/prompt tuning - tiled_input_ids_buf_, - 1, - seq_len, - seq_len, // must be same - batch_size, - hidden_units_, - stream_); + invokeInputIdsEmbeddingLookup(context_decoder_input_buf_, + llama_weights->pre_decoder_embedding_table, + tiled_input_ids_buf_, + seq_len, + batch_size, + hidden_units_, + stream_); sync_check_cuda_error(); + +// std::cout << 0 << "==================" << "EMBEDDING\n"; +// print_tensor3(context_decoder_input_buf_, batch_size, seq_len, hidden_units_); +// std::cout << 0 << "==================" << "EMBEDDING\n"; } std::unordered_map decoder_input_tensors{ @@ -276,7 +372,7 @@ void LLaMA::forward(std::unordered_map* output_ten {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(seq_len)}, input_attention_mask_}}, 
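            // Note on the mask: this is the buffer sized batch_size * seq_len * seq_len in
            // allocateBuffer, viewed here as [batch_size, 1, seq_len, seq_len]. Assuming the usual
            // causal layout (mask[b][0][i][j] == 1 iff position j is visible to position i), the
            // unfused softmax kernel applies (1.0f - mask_val) * -10000.0f as a bias to masked logits.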
{"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, tiled_input_lengths_buf_}}, - {"start_pos", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &start_pos}}}; + {"start_pos", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &start_pos}}}; std::unordered_map decoder_output_tensors{ {"decoder_output", @@ -298,6 +394,7 @@ void LLaMA::forward(std::unordered_map* output_ten hidden_units_, stream_); sync_check_cuda_error(); + cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, vocab_size_, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 66fc30b6b..587118703 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -21,8 +21,104 @@ #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" +#include + namespace fastertransformer { +template +static void _print_tensor1(T* out, int dim1, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + + std::cout << "["; + for (int i = start0; i < end0; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != dim1 - 1) + std::cout << ", "; + } + if (end0 != start1) { + std::cout << "..., "; + } + for (int i = start1; i < end1; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != end1 - 1) + std::cout << ", "; + } + std::cout << "]"; +} + +template +static void _print_tensor2(T* out, int dim1, int dim2, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor1(&out[i * dim2], dim2, indent + 1); + if (i != dim1 - 1) + std::cout << ",\n"; + } + if (end0 != start1) { + std::cout << ind; + std::cout << "...,\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor1(&out[i * dim2], dim2, indent + 1); + if (i != end1 - 1) + std::cout << ",\n"; + } + std::cout << "]"; +} + +template +static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int indent) +{ + std::string ind(indent, ' '); + + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 
0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); + if (i != dim1 - 1) + std::cout << ",\n\n"; + } + if (start1 != end1) { + std::cout << ind; + std::cout << "...,\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); + if (i != end1 - 1) + std::cout << ",\n"; + } + std::cout << "]\n"; +} + +template +static void print_tensor3(T* in, int dim1, int dim2, int dim3) +{ + T* out = (T*)malloc(sizeof(T) * dim1 * dim2 * dim3); + cudaMemcpy(out, in, sizeof(T) * dim1 * dim2 * dim3, cudaMemcpyDeviceToHost); + _print_tensor3(out, dim1, dim2, dim3, 1); + free(out); +} + template void LLaMAContextDecoder::initialize() { @@ -64,8 +160,6 @@ void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); self_attn_output_ = reinterpret_cast( allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); - ffn_output_ = reinterpret_cast( - allocator_->reMalloc(ffn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); decoder_layer_output_ = reinterpret_cast( allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); @@ -81,7 +175,6 @@ void LLaMAContextDecoder::freeBuffer() if (is_allocate_buffer_ == true) { allocator_->free((void**)(&decoder_normed_input_)); allocator_->free((void**)(&self_attn_output_)); - allocator_->free((void**)(&ffn_output_)); allocator_->free((void**)(&decoder_layer_output_)); allocator_->free((void**)(&h_pinned_token_num_ptr_), true); allocator_->free((void**)(&padding_offset_)); @@ -280,6 +373,13 @@ void LLaMAContextDecoder::forward(std::unordered_map* ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); sync_check_cuda_error(); } +// if (isFirstLayerParallelId(l)) { +// std::cout << l << "==================" << "RECV\n"; +// print_tensor3(layer_input, batch_size, seq_len, hidden_units_); +// std::cout << l << "==================" << "RECV\n"; +// std::cout << std::flush; +// } + invokeGeneralLLaMALayerNorm(decoder_normed_input_, layer_input, @@ -352,6 +452,14 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); + +// if (isLastLayerParallelId(l)) { +// std::cout << l << "==================" << "SEND\n"; +// print_tensor3(layer_input, batch_size, seq_len, hidden_units_); +// std::cout << l << "==================" << "SEND\n"; +// std::cout << std::flush; +// } + if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index 452567208..cb6736f02 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -69,7 +69,6 @@ class LLaMAContextDecoder: public BaseLayer { protected: T* decoder_normed_input_ = nullptr; T* self_attn_output_ = nullptr; - T* ffn_output_ = nullptr; T* decoder_layer_output_ = nullptr; size_t* h_pinned_token_num_ptr_ = nullptr; int* padding_offset_ = nullptr; diff --git 
a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index 7be46c7ed..45c1e1575 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -88,7 +88,7 @@ LLaMA::forward(th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t s const int seq_len = input_ids.size(1); th::Tensor output_logits = torch::empty({batch_size, seq_len, (long)vocab_size_}, torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false)); - ftllama->forward(output_logits, input_ids, input_lengths, start_pos); + ftllama->forward(output_logits, input_ids, input_lengths, (int)start_pos); return output_logits; } diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 0d97dc322..ab594c5c7 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -30,7 +30,7 @@ class IFLLaMA { public: virtual ~IFLLaMA() {} virtual void - forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t start_pos) = 0; + forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int start_pos) = 0; }; template @@ -117,7 +117,7 @@ class FTLLaMA: public IFLLaMA { virtual void forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, - const int64_t start_pos) override + const int start_pos) override { auto stream = at::cuda::getCurrentCUDAStream().stream(); cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); @@ -171,7 +171,7 @@ class FTLLaMA: public IFLLaMA { {"input_lengths", ft::Tensor{ ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{request_batch_size}, get_ptr(input_lengths)}}, - {"start_pos", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_UINT32, std::vector{1}, &start_pos}}}; + {"start_pos", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &start_pos}}}; std::unordered_map output_tensors = std::unordered_map{ {"output_logits", From db6efddf6e3112548dc1ce8bbc6234b9646c6e57 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 24 Sep 2023 00:12:35 +0000 Subject: [PATCH 27/55] dump --- .../kernels/layernorm_kernels.cu | 47 ++--- .../kernels/layernorm_kernels.h | 1 - .../LLaMAContextAttentionLayer.cc | 73 ++++---- src/fastertransformer/models/llama/LLaMA.cc | 98 +--------- .../models/llama/LLaMAContextDecoder.cc | 138 +++------------ src/fastertransformer/utils/llama_utils.h | 167 ++++++++++++++++++ 6 files changed, 232 insertions(+), 292 deletions(-) create mode 100644 src/fastertransformer/utils/llama_utils.h diff --git a/src/fastertransformer/kernels/layernorm_kernels.cu b/src/fastertransformer/kernels/layernorm_kernels.cu index 6244dbfd6..80a656cf7 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.cu +++ b/src/fastertransformer/kernels/layernorm_kernels.cu @@ -2511,56 +2511,36 @@ template void invokeGeneralT5LayerNorm(__nv_bfloat16* out, /******************* invokeGeneralLLaMALayerNorm ***********************/ template -__global__ void generalLLaMALayerNorm(const T* __restrict input, - const T* __restrict gamma, - const T* __restrict beta, - T* normed_output, - const float layernorm_eps, - int m, - int n) +__global__ void generalLLaMALayerNorm( + const T* __restrict input, const T* __restrict gamma, T* normed_output, const float layernorm_eps, int m, int n) { const int tid = threadIdx.x; - extern __shared__ __align__(sizeof(float)) char _shmem[]; - T* shmem = reinterpret_cast(_shmem); - - __shared__ float s_variance; - float variance 
= 0.0f; - - using Float_Packed_T = typename packed_as::value>::type; - using Scalar_T = typename packed_as::type; - float local_var_sum = 0.0f; for (int i = tid; i < n; i += blockDim.x) { float val = (float)(ldg(&input[blockIdx.x * n + i])); local_var_sum += val * val; } - variance = blockReduceSum(local_var_sum); + float variance = 0.0f; + variance = blockReduceSum(local_var_sum); + __shared__ float s_variance; if (threadIdx.x == 0) { - s_variance = rsqrtf(variance / (float)n + layernorm_eps); + s_variance = rsqrtf((variance / (float)n) + layernorm_eps); } __syncthreads(); for (int i = tid; i < n; i += blockDim.x) { - const int index = blockIdx.x * n + i; - float beta_val = (beta == nullptr) ? 0.0f : (float)ldg(&beta[i]); - T val = (T)(((float)input[index] * s_variance) * (float)(ldg(&gamma[i])) + beta_val); - - normed_output[index] = val; + const int index = blockIdx.x * n + i; + T val = (T) (((float)ldg(&input[index])) * s_variance); + normed_output[index] = val * ldg(&gamma[i]); } } template -void invokeGeneralLLaMALayerNorm(T* out, - const T* input, - const T* gamma, - const T* beta, - const float layernorm_eps, - const int m, - const int n, - cudaStream_t stream) +void invokeGeneralLLaMALayerNorm( + T* out, const T* input, const T* gamma, const float layernorm_eps, const int m, const int n, cudaStream_t stream) { dim3 grid(m); dim3 block(min(n, 1024)); @@ -2572,13 +2552,12 @@ void invokeGeneralLLaMALayerNorm(T* out, block.x = 1024; } - generalLLaMALayerNorm<<>>(input, gamma, beta, out, layernorm_eps, m, n); + generalLLaMALayerNorm<<>>(input, gamma, out, layernorm_eps, m, n); } template void invokeGeneralLLaMALayerNorm(float* out, const float* input, const float* gamma, - const float* beta, const float layernorm_eps, const int m, const int n, @@ -2586,7 +2565,6 @@ template void invokeGeneralLLaMALayerNorm(float* out, template void invokeGeneralLLaMALayerNorm(half* out, const half* input, const half* gamma, - const half* beta, const float layernorm_eps, const int m, const int n, @@ -2595,7 +2573,6 @@ template void invokeGeneralLLaMALayerNorm(half* out, template void invokeGeneralLLaMALayerNorm(__nv_bfloat16* out, const __nv_bfloat16* input, const __nv_bfloat16* gamma, - const __nv_bfloat16* beta, const float layernorm_eps, const int m, const int n, diff --git a/src/fastertransformer/kernels/layernorm_kernels.h b/src/fastertransformer/kernels/layernorm_kernels.h index 8fb8ecf8b..c7b31e874 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.h +++ b/src/fastertransformer/kernels/layernorm_kernels.h @@ -180,7 +180,6 @@ template void invokeGeneralLLaMALayerNorm(T* out, const T* input, const T* gamma, - const T* beta, const float layernorm_eps, const int m, const int n, diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index a1cb9b81f..151a1012c 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -18,6 +18,7 @@ #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" #include "src/fastertransformer/kernels/layernorm_kernels.h" #include "src/fastertransformer/kernels/unfused_attention_kernels.h" +#include "src/fastertransformer/utils/llama_utils.h" #include "src/fastertransformer/utils/nvtx_utils.h" namespace fastertransformer { @@ -79,58 +80,25 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, 
3 * hidden_units_ /* n */); sync_check_cuda_error(); - /* - if (layer_id < 15) { - T* out = (T*)malloc(sizeof(T) * m * 3 * hidden_units_); - T *tmp = out; - cudaMemcpy( - out, qkv_buf_, sizeof(T) * m * 3 * hidden_units_, cudaMemcpyDeviceToHost); - for (int i = 0; i < 3; ++i) { - for (int b = 0; b < batch_size; ++b) { - std::cout << "[\n"; - for (int s = 0; s < 3; ++s) { - std::cout << "[ "; - for (int h = 0; h < 3; ++h) { - std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; - } - std::cout << " ... "; - for (int h = hidden_units_-3; h < hidden_units_; ++h) { - std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; - } - std::cout << "]\n"; - } - std::cout << "...\n"; - for (int s = seq_len-3; s < seq_len; ++s) { - std::cout << "[ "; - for (int h = 0; h < 3; ++h) { - std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; - } - std::cout << " ... "; - for (int h = hidden_units_-3; h < hidden_units_; ++h) { - std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; - } - std::cout << "]\n"; - } - std::cout << "]\n"; - } - std::cout << "\n"; - out += hidden_units_; - } - - free(tmp); + if (true) { + print_tensor3(qkv_buf_, + batch_size, + seq_len, + hidden_units_, + seq_len * hidden_units_ * 3, + hidden_units_ * 3, + batch_size * seq_len * hidden_units_ * 3, + 2*hidden_units_); } - */ if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); sync_check_cuda_error(); } - PrefixPromptBatchWeightsParam param; invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, v_buf_2_, - param, // prefix prompt qkv_buf_, attention_weights->query_weight.bias, padding_offset, @@ -139,12 +107,31 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten m, head_num_, size_per_head_, - rotary_embedding_dim_, + stream_); + /* + invokeAddFusedQKVBiasTranspose(q_buf_2_, + k_buf_2_, + v_buf_2_, + PrefixPromptBatchWeightsParam{}, + qkv_buf_, + attention_weights->query_weight.bias, + padding_offset, + batch_size, + seq_len, + m, + head_num_, + size_per_head_, + //rotary_embedding_dim_, + 0, false, attention_weights->query_weight.scale_out, 0, // int8_mode stream_); + */ sync_check_cuda_error(); + // if (true) { + // print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); + // } // const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length // // Use batch major diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index d21302c61..cdf8071e6 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -20,107 +20,12 @@ #include "src/fastertransformer/kernels/gpt_kernels.h" #include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" #include "src/fastertransformer/utils/memory_utils.h" +#include "src/fastertransformer/utils/llama_utils.h" #include #include -#include - namespace fastertransformer { -template -static void _print_tensor1(T* out, int dim1, int indent) -{ - std::string ind(indent, ' '); - int start0 = 0; - int end0 = (dim1 < 3) ? dim1 : 3; - int start1 = (dim1 < 3) ? 0 : dim1 - 3; - int end1 = (dim1 < 3) ? 
0 : dim1; - - std::cout << "["; - for (int i = start0; i < end0; ++i) { - std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; - if (i != dim1 - 1) - std::cout << ", "; - } - if (end0 != start1) { - std::cout << "..., "; - } - for (int i = start1; i < end1; ++i) { - std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; - if (i != end1 - 1) - std::cout << ", "; - } - std::cout << "]"; -} - -template -static void _print_tensor2(T* out, int dim1, int dim2, int indent) -{ - std::string ind(indent, ' '); - int start0 = 0; - int end0 = (dim1 < 3) ? dim1 : 3; - int start1 = (dim1 < 3) ? 0 : dim1 - 3; - int end1 = (dim1 < 3) ? 0 : dim1; - std::cout << "["; - for (int i = start0; i < end0; ++i) { - if (i != start0) - std::cout << ind; - _print_tensor1(&out[i * dim2], dim2, indent + 1); - if (i != dim1 - 1) - std::cout << ",\n"; - } - if (end0 != start1) { - std::cout << ind; - std::cout << "...,\n"; - } - for (int i = start1; i < end1; ++i) { - std::cout << ind; - _print_tensor1(&out[i * dim2], dim2, indent + 1); - if (i != end1 - 1) - std::cout << ",\n"; - } - std::cout << "]"; -} - -template -static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int indent) -{ - std::string ind(indent, ' '); - - int start0 = 0; - int end0 = (dim1 < 3) ? dim1 : 3; - int start1 = (dim1 < 3) ? 0 : dim1 - 3; - int end1 = (dim1 < 3) ? 0 : dim1; - std::cout << "["; - for (int i = start0; i < end0; ++i) { - if (i != start0) - std::cout << ind; - _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); - if (i != dim1 - 1) - std::cout << ",\n\n"; - } - if (start1 != end1) { - std::cout << ind; - std::cout << "...,\n"; - } - for (int i = start1; i < end1; ++i) { - std::cout << ind; - _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); - if (i != end1 - 1) - std::cout << ",\n"; - } - std::cout << "]\n"; -} - -template -static void print_tensor3(T* in, int dim1, int dim2, int dim3) -{ - T* out = (T*)malloc(sizeof(T) * dim1 * dim2 * dim3); - cudaMemcpy(out, in, sizeof(T) * dim1 * dim2 * dim3, cudaMemcpyDeviceToHost); - _print_tensor3(out, dim1, dim2, dim3, 1); - free(out); -} - template void LLaMA::initialize() { @@ -388,7 +293,6 @@ void LLaMA::forward(std::unordered_map* output_ten invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, - llama_weights->post_decoder_layernorm.beta, layernorm_eps_, batch_size * seq_len, hidden_units_, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 587118703..275c61ad3 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -20,105 +20,10 @@ #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" - -#include +#include "src/fastertransformer/utils/llama_utils.h" namespace fastertransformer { -template -static void _print_tensor1(T* out, int dim1, int indent) -{ - std::string ind(indent, ' '); - int start0 = 0; - int end0 = (dim1 < 3) ? dim1 : 3; - int start1 = (dim1 < 3) ? 0 : dim1 - 3; - int end1 = (dim1 < 3) ? 
0 : dim1; - - std::cout << "["; - for (int i = start0; i < end0; ++i) { - std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; - if (i != dim1 - 1) - std::cout << ", "; - } - if (end0 != start1) { - std::cout << "..., "; - } - for (int i = start1; i < end1; ++i) { - std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; - if (i != end1 - 1) - std::cout << ", "; - } - std::cout << "]"; -} - -template -static void _print_tensor2(T* out, int dim1, int dim2, int indent) -{ - std::string ind(indent, ' '); - int start0 = 0; - int end0 = (dim1 < 3) ? dim1 : 3; - int start1 = (dim1 < 3) ? 0 : dim1 - 3; - int end1 = (dim1 < 3) ? 0 : dim1; - std::cout << "["; - for (int i = start0; i < end0; ++i) { - if (i != start0) - std::cout << ind; - _print_tensor1(&out[i * dim2], dim2, indent + 1); - if (i != dim1 - 1) - std::cout << ",\n"; - } - if (end0 != start1) { - std::cout << ind; - std::cout << "...,\n"; - } - for (int i = start1; i < end1; ++i) { - std::cout << ind; - _print_tensor1(&out[i * dim2], dim2, indent + 1); - if (i != end1 - 1) - std::cout << ",\n"; - } - std::cout << "]"; -} - -template -static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int indent) -{ - std::string ind(indent, ' '); - - int start0 = 0; - int end0 = (dim1 < 3) ? dim1 : 3; - int start1 = (dim1 < 3) ? 0 : dim1 - 3; - int end1 = (dim1 < 3) ? 0 : dim1; - std::cout << "["; - for (int i = start0; i < end0; ++i) { - if (i != start0) - std::cout << ind; - _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); - if (i != dim1 - 1) - std::cout << ",\n\n"; - } - if (start1 != end1) { - std::cout << ind; - std::cout << "...,\n"; - } - for (int i = start1; i < end1; ++i) { - std::cout << ind; - _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); - if (i != end1 - 1) - std::cout << ",\n"; - } - std::cout << "]\n"; -} - -template -static void print_tensor3(T* in, int dim1, int dim2, int dim3) -{ - T* out = (T*)malloc(sizeof(T) * dim1 * dim2 * dim3); - cudaMemcpy(out, in, sizeof(T) * dim1 * dim2 * dim3, cudaMemcpyDeviceToHost); - _print_tensor3(out, dim1, dim2, dim3, 1); - free(out); -} - template void LLaMAContextDecoder::initialize() { @@ -351,8 +256,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { - invokeRemovePadding( - decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); + invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); sync_check_cuda_error(); } @@ -373,24 +277,23 @@ void LLaMAContextDecoder::forward(std::unordered_map* ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); sync_check_cuda_error(); } -// if (isFirstLayerParallelId(l)) { -// std::cout << l << "==================" << "RECV\n"; -// print_tensor3(layer_input, batch_size, seq_len, hidden_units_); -// std::cout << l << "==================" << "RECV\n"; -// std::cout << std::flush; -// } - invokeGeneralLLaMALayerNorm(decoder_normed_input_, layer_input, llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, layernorm_eps_, h_token_num, hidden_units_, stream_); sync_check_cuda_error(); + if (true) { + std::cout << l << "==================" << "ATTN_NORM\n"; + print_tensor3(decoder_normed_input_, batch_size, seq_len, hidden_units_); + std::cout << l << "==================" << "ATTN_NORM\n"; + std::cout << 
std::flush; + } + TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, {"attention_mask", @@ -419,13 +322,24 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + std::cout << l << "==================" << "QBUF\n"; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); + std::cout << l << "==================" << "QBUF\n"; + std::cout << std::flush; + +// if (true) { +// std::cout << l << "==================" << "ATTENTION\n"; +// print_tensor3(self_attn_output_, batch_size, seq_len, hidden_units_); +// std::cout << l << "==================" << "ATTENTION\n"; +// std::cout << std::flush; +// } + invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, - layer_input, + decoder_normed_input_, self_attn_output_, layer_input, llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, @@ -438,7 +352,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); TensorMap ffn_input_tensors( - {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}}); + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); TensorMap ffn_output_tensors( {{"ffn_output", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); @@ -452,14 +366,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); - -// if (isLastLayerParallelId(l)) { -// std::cout << l << "==================" << "SEND\n"; -// print_tensor3(layer_input, batch_size, seq_len, hidden_units_); -// std::cout << l << "==================" << "SEND\n"; -// std::cout << std::flush; -// } - if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h new file mode 100644 index 000000000..c1c2632c7 --- /dev/null +++ b/src/fastertransformer/utils/llama_utils.h @@ -0,0 +1,167 @@ +#include +#include +#include + +namespace fastertransformer { + +template +static void _print_tensor1(T* out, int dim1, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + + std::cout << "["; + for (int i = start0; i < end0; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != dim1 - 1) + std::cout << " "; + } + if (end0 != start1) { + std::cout << "... "; + } + for (int i = start1; i < end1; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != end1 - 1) + std::cout << " "; + } + std::cout << "]"; +} + +template +static void _print_tensor2(T* out, int dim1, int dim2, int stride, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 
0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor1(&out[i * stride], dim2, indent + 1); + if (i != dim1 - 1) + std::cout << "\n"; + } + if (end0 != start1) { + std::cout << ind; + std::cout << "...\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor1(&out[i * stride], dim2, indent + 1); + if (i != end1 - 1) + std::cout << "\n"; + } + std::cout << "]"; +} + +template +static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int stride1, int stride2, int indent) +{ + std::string ind(indent, ' '); + + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor2(&out[i * stride1], dim2, dim3, stride2, indent + 1); + if (i != dim1 - 1) + std::cout << "\n\n"; + } + if (start1 != end1) { + std::cout << ind; + std::cout << "...\n\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor2(&out[i * stride1], dim2, dim3, stride2, indent + 1); + if (i != end1 - 1) + std::cout << "\n"; + } + std::cout << "]\n"; +} + +template +static void +_print_tensor4(T* out, int dim1, int dim2, int dim3, int dim4, int stride1, int stride2, int stride3, int indent) +{ + std::string ind(indent, ' '); + + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor3(&out[i * stride1], dim2, dim3, dim4, stride2, stride3, indent + 1); + if (i != dim1 - 1) + std::cout << "\n\n"; + } + if (start1 != end1) { + std::cout << ind; + std::cout << "...\n\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor3(&out[i * stride1], dim2, dim3, dim4, stride2, stride3, indent + 1); + if (i != end1 - 1) + std::cout << "\n"; + } + std::cout << "]\n"; +} + +template +static void print_tensor3(T* in, int dim1, int dim2, int dim3, int stride1, int stride2, int size, int start) +{ + T* out = (T*)malloc(sizeof(T) * size); + cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); + _print_tensor3(&out[start], dim1, dim2, dim3, stride1, stride2, 1); + + /* + if (stride2 != dim3) { + for (int i = dim1 * dim2 * 3 * dim3 - 1 * dim3 - 8; i < dim1 * dim2 * 3 * dim3 - 1 * dim3; ++i) { + std::cout << out[i] << " "; + } + std::cout << "\n"; + } + */ + free(out); +} + +template +static void print_tensor3(T* in, int dim1, int dim2, int dim3) +{ + print_tensor3(in, dim1, dim2, dim3, dim2 * dim3, dim3, dim1 * dim2 * dim3, 0); +} + +template +static void print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4) +{ + print_tensor4(in, dim1, dim2, dim3, dim4, dim2 * dim3 * dim4, dim3 * dim4, dim4, dim1 * dim2 * dim3 * dim4); +} + +template +static void +print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4, int stride1, int stride2, int stride3, int size, int start) +{ + T* out = (T*)malloc(sizeof(T) * size); + cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); + _print_tensor4(&out[start], dim1, dim2, dim3, dim4, stride1, stride2, stride3, 1); + for (int i = dim1 * dim2 * dim3 * dim4 - 8; i < dim1 * dim2 * dim3 * dim4; ++i) { + std::cout << out[i] << " "; + } + std::cout << "\n"; + free(out); +} + +} // namespace fastertransformer From 
6e099590f0e670a1eaa4272789702982ce02586c Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 24 Sep 2023 00:41:15 +0000 Subject: [PATCH 28/55] dmpdmp --- .../kernels/unfused_attention_kernels.cu | 121 ++++++++++++++++-- .../kernels/unfused_attention_kernels.h | 13 ++ .../LLaMAContextAttentionLayer.cc | 54 ++++---- src/fastertransformer/utils/llama_utils.h | 12 +- 4 files changed, 156 insertions(+), 44 deletions(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 61d2a54ff..3e729b7e0 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -278,14 +278,14 @@ __global__ void softmax_kernel(T* attn_score, // Loop along with Q dimension. for (int64_t qi = blockIdx.x; qi < q_length; qi += gridDim.x) { - float data[ITEMS_PER_THREAD]; - int64_t qk_offset; - float local_max = -1e20f; + float data[ITEMS_PER_THREAD]; + int64_t qk_offset; + float local_max = -1e20f; // Loop along with K dimension. for (int64_t i = 0; blockDim.x * i + threadIdx.x < k_length; i++) { - int64_t ki = blockDim.x * i + threadIdx.x; // Index of K dimension. - qk_offset = ((bi * head_num + hi) * q_length + qi) * k_length + ki; + int64_t ki = blockDim.x * i + threadIdx.x; // Index of K dimension. + qk_offset = ((bi * head_num + hi) * q_length + qi) * k_length + ki; float qk_val = static_cast(qk[qk_offset]); float qk_bias = 0.0f; @@ -297,8 +297,8 @@ __global__ void softmax_kernel(T* attn_score, qk_bias += static_cast(linear_bias_slope * (ki - qi)); } - int64_t mask_offset = (bi * q_length + qi) * k_length + ki; - float mask_val = static_cast(ldg(&attn_mask[mask_offset])); + int64_t mask_offset = (bi * q_length + qi) * k_length + ki; + float mask_val = static_cast(ldg(&attn_mask[mask_offset])); qk_bias += (1.0f - mask_val) * -10000.0f; data[i] = qk_scale * qk_val + qk_bias; @@ -1363,8 +1363,8 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int head_idx = blockIdx.y; const int tidx = threadIdx.x; - const int total_seq_len = param.max_prefix_prompt_length + seq_len; - const bool is_masked = tidx * vec_size >= size_per_head; + const int total_seq_len = param.max_prefix_prompt_length + seq_len; + const bool is_masked = tidx * vec_size >= size_per_head; // NOTE: blockIdx.x < batch_size * param.max_prefix_prompt_length really handles prefix prompts if (PREFIX_PROMPT && token_idx < 0) { @@ -1581,6 +1581,109 @@ INSTANTIATEADDFUSEDQKVBIASTRANSPOSE(__nv_bfloat16); #endif #undef INSTANTIATEADDFUSEDQKVBIASTRANSPOSE +template +__global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head) +{ + // QKV: [token_num, 3, n] + // qkv_bias: [3, n] + // q_buf, k_buf, v_buf: [batch, head_num, seq_len, size_per_head] + + T* qkv_ptr[3] = {q_buf, k_buf, v_buf}; + const int n = head_num * size_per_head; + for (int index = blockDim.x * blockIdx.x + threadIdx.x; index < token_num * 3 * n; + index += gridDim.x * blockDim.x) { + + const int token_idx = index / (3 * n); + const int token_padded_idx = token_idx + (padding_offset == nullptr ? 
0 : padding_offset[token_idx]); + const int target_batch_id = token_padded_idx / seq_len; + const int seq_id = token_padded_idx % seq_len; + + const int qkv_id = (index % (3 * n)) / n; + const int head_id = (index % n) / size_per_head; + const int size_id = index % size_per_head; + + T val = ldg(&QKV[index]); + QKV[index] = val; + qkv_ptr[qkv_id][target_batch_id * head_num * seq_len * size_per_head + head_id * seq_len * size_per_head + + seq_id * size_per_head + size_id] = val; + } +} + +template +void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + cudaStream_t stream) +{ + const int m = token_num; + const int n = head_num * size_per_head; + dim3 block(384); + dim3 grid((int)(ceil(1.0 * m * n / 384))); + llama_add_fusedQKV_bias_transpose_kernel<<>>(q_buf, + k_buf, + v_buf, + QKV, + padding_offset, + batch_size, + seq_len, + token_num, + head_num, + size_per_head); +} + +template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, + float* k_buf, + float* v_buf, + float* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + cudaStream_t stream); + +template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, + half* k_buf, + half* v_buf, + half* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, + __nv_bfloat16* k_buf, + __nv_bfloat16* v_buf, + __nv_bfloat16* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + cudaStream_t stream); +#endif + template __global__ void transpose_4d(T* dst, T* src, diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index 7ac7604d4..5f8cd0669 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -113,6 +113,19 @@ struct PrefixPromptBatchWeightsParam { const size_t prefix_prompt_layer_offset_per_seq = 0; }; +template +void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + cudaStream_t stream); + template void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 151a1012c..dbf447707 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -80,34 +80,30 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, 3 * hidden_units_ /* n */); sync_check_cuda_error(); - if (true) { - print_tensor3(qkv_buf_, - batch_size, - seq_len, - hidden_units_, - seq_len * hidden_units_ * 3, - hidden_units_ * 3, - batch_size * seq_len * hidden_units_ * 3, - 2*hidden_units_); - } - if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are 
continuous cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); sync_check_cuda_error(); } - invokeAddFusedQKVBiasTranspose(q_buf_2_, - k_buf_2_, - v_buf_2_, - qkv_buf_, - attention_weights->query_weight.bias, - padding_offset, - batch_size, - seq_len, - m, - head_num_, - size_per_head_, - stream_); + invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, + k_buf_2_, + v_buf_2_, + qkv_buf_, + nullptr, // padding_offset, + batch_size, + seq_len, + m, + head_num_, + size_per_head_, + stream_); + if (true) { + std::cout << "batch_size: " << batch_size << "\n"; + std::cout << "head_num_: " << head_num_ << "\n"; + std::cout << "seq_len: " << seq_len << "\n"; + std::cout << "size_per_head_: " << size_per_head_ << "\n"; + print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); + } + /* invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, @@ -408,26 +404,26 @@ template void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, false); + qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, true); + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, true); k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; v_buf_2_ = k_buf_2_ + batch_size * seq_len * hidden_units_; // save memory usage when using fmha if (allocate_qk_buf) { - qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, false); + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, true); } else { allocator_->free((void**)(&qk_buf_)); qk_buf_ = nullptr; } - qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, false); - qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, false); + qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, true); + qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, true); if (is_qk_buf_float_ == true) { if (allocate_qk_buf) { qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, false); + qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, true); } else { allocator_->free((void**)(&qk_buf_float_)); diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h index c1c2632c7..8f6dcf5ff 100644 --- a/src/fastertransformer/utils/llama_utils.h +++ b/src/fastertransformer/utils/llama_utils.h @@ -144,12 +144,6 @@ static void print_tensor3(T* in, int dim1, int dim2, int dim3) print_tensor3(in, dim1, dim2, dim3, dim2 * dim3, dim3, dim1 * dim2 * dim3, 0); } -template -static void print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4) -{ - print_tensor4(in, dim1, dim2, dim3, dim4, dim2 * dim3 * dim4, dim3 * dim4, dim4, dim1 * dim2 * dim3 * dim4); -} - template static void print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4, int stride1, int stride2, int stride3, int size, int start) @@ -164,4 +158,10 @@ print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4, int 
stride1, int st free(out); } +template +static void print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4) +{ + print_tensor4(in, dim1, dim2, dim3, dim4, dim2 * dim3 * dim4, dim3 * dim4, dim4, dim1 * dim2 * dim3 * dim4, 0); +} + } // namespace fastertransformer From cf8087a4d03be32932b606f13c17f2c3658c122b Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 24 Sep 2023 00:41:58 +0000 Subject: [PATCH 29/55] dp --- .../layers/attention_layers/LLaMAContextAttentionLayer.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index dbf447707..b00c8b991 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -97,10 +97,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten size_per_head_, stream_); if (true) { - std::cout << "batch_size: " << batch_size << "\n"; - std::cout << "head_num_: " << head_num_ << "\n"; - std::cout << "seq_len: " << seq_len << "\n"; - std::cout << "size_per_head_: " << size_per_head_ << "\n"; print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); } From 13478f430048337d875aec015a4eb81189e80730 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 25 Sep 2023 09:30:35 +0000 Subject: [PATCH 30/55] dmp --- .../kernels/unfused_attention_kernels.cu | 78 +++++++++------- .../kernels/unfused_attention_kernels.h | 1 + .../LLaMAContextAttentionLayer.cc | 93 +++++++++++++------ .../models/llama/LLaMAContextDecoder.cc | 37 ++++---- src/fastertransformer/utils/llama_utils.h | 23 ++--- 5 files changed, 136 insertions(+), 96 deletions(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 3e729b7e0..97df58261 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1589,32 +1589,50 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, const int* padding_offset, const int batch_size, const int seq_len, - const int token_num, const int head_num, - const int size_per_head) + const int size_per_head, + const int rotary_embedding_dim) { - // QKV: [token_num, 3, n] - // qkv_bias: [3, n] - // q_buf, k_buf, v_buf: [batch, head_num, seq_len, size_per_head] + constexpr int vec_size = Vec_t::size; + using Vec_t = typename Vec_t::Type; + const int token_idx = blockIdx.x; + const int token_padding_offset = (padding_offset == nullptr || token_idx < 0) ? 0 : padding_offset[token_idx]; + const int tgt_token_idx = token_idx + token_padding_offset; - T* qkv_ptr[3] = {q_buf, k_buf, v_buf}; + const int batch_idx = tgt_token_idx / seq_len; + const int seq_idx = tgt_token_idx % seq_len; + + const int head_idx = blockIdx.y; + const int tidx = threadIdx.x; + + const bool is_masked = tidx * vec_size >= size_per_head; + + const int hidden_idx = head_idx * size_per_head + tidx * vec_size; const int n = head_num * size_per_head; - for (int index = blockDim.x * blockIdx.x + threadIdx.x; index < token_num * 3 * n; - index += gridDim.x * blockDim.x) { - const int token_idx = index / (3 * n); - const int token_padded_idx = token_idx + (padding_offset == nullptr ? 
0 : padding_offset[token_idx]); - const int target_batch_id = token_padded_idx / seq_len; - const int seq_id = token_padded_idx % seq_len; + const int src_q_idx = token_idx * 3 * n + hidden_idx; + const int src_k_idx = token_idx * 3 * n + hidden_idx + n; + const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; - const int qkv_id = (index % (3 * n)) / n; - const int head_id = (index % n) / size_per_head; - const int size_id = index % size_per_head; + Vec_t q, k, v; + if (!is_masked) { + q = *reinterpret_cast(&QKV[src_q_idx]); + k = *reinterpret_cast(&QKV[src_k_idx]); + v = *reinterpret_cast(&QKV[src_v_idx]); + } - T val = ldg(&QKV[index]); - QKV[index] = val; - qkv_ptr[qkv_id][target_batch_id * head_num * seq_len * size_per_head + head_id * seq_len * size_per_head - + seq_id * size_per_head + size_id] = val; + mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, seq_idx); + + const int dest_q_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + + seq_idx * size_per_head + tidx * vec_size; + + const int dest_kv_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + + seq_idx * size_per_head + tidx * vec_size; + + if (!is_masked) { + *reinterpret_cast(&q_buf[dest_q_idx]) = q; + *reinterpret_cast(&k_buf[dest_kv_idx]) = k; + *reinterpret_cast(&v_buf[dest_kv_idx]) = v; } } @@ -1629,22 +1647,13 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int token_num, const int head_num, const int size_per_head, + const int rotary_embedding_dim, cudaStream_t stream) { - const int m = token_num; - const int n = head_num * size_per_head; - dim3 block(384); - dim3 grid((int)(ceil(1.0 * m * n / 384))); - llama_add_fusedQKV_bias_transpose_kernel<<>>(q_buf, - k_buf, - v_buf, - QKV, - padding_offset, - batch_size, - seq_len, - token_num, - head_num, - size_per_head); + dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); + dim3 grid(token_num, head_num); + llama_add_fusedQKV_bias_transpose_kernel<<>>( + q_buf, k_buf, v_buf, QKV, padding_offset, batch_size, seq_len, head_num, size_per_head, rotary_embedding_dim); } template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, @@ -1657,6 +1666,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, const int token_num, const int head_num, const int size_per_head, + const int rotary_embedding_dim, cudaStream_t stream); template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, @@ -1669,6 +1679,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, const int token_num, const int head_num, const int size_per_head, + const int rotary_embedding_dim, cudaStream_t stream); #ifdef ENABLE_BF16 template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, @@ -1681,6 +1692,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, const int token_num, const int head_num, const int size_per_head, + const int rotary_embedding_dim, cudaStream_t stream); #endif diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index 5f8cd0669..0ccf64d8c 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -124,6 +124,7 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int token_num, const int head_num, const int size_per_head, + const int rotary_embedding_dim, cudaStream_t stream); template diff --git 
a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index b00c8b991..5aecc5de6 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -80,50 +80,83 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, 3 * hidden_units_ /* n */); sync_check_cuda_error(); +// if (true) { +// print_tensor3(qkv_buf_, +// batch_size, +// seq_len, +// hidden_units_, +// seq_len * 3 * hidden_units_, +// 3 * hidden_units_, +// batch_size * seq_len * 3 * hidden_units_, +// 0); +// print_tensor3(qkv_buf_, +// batch_size, +// seq_len, +// hidden_units_, +// seq_len * 3 * hidden_units_, +// 3 * hidden_units_, +// batch_size * seq_len * 3 * hidden_units_, +// hidden_units_); +// print_tensor3(qkv_buf_, +// batch_size, +// seq_len, +// hidden_units_, +// seq_len * 3 * hidden_units_, +// 3 * hidden_units_, +// batch_size * seq_len * 3 * hidden_units_, +// 2*hidden_units_); +// } +// if (true) { +// print_tensor4(qkv_buf_, +// batch_size, seq_len, head_num_, size_per_head_, +// seq_len * 3 * head_num_ * size_per_head_, +// 3 * head_num_ * size_per_head_, +// size_per_head_, +// batch_size * seq_len * 3 * head_num_ * size_per_head_, +// 0 +// ); +// print_tensor4(qkv_buf_, +// batch_size, seq_len, head_num_, size_per_head_, +// seq_len * 3 * head_num_ * size_per_head_, +// 3 * head_num_ * size_per_head_, +// size_per_head_, +// batch_size * seq_len * 3 * head_num_ * size_per_head_, +// head_num_ * size_per_head_ +// ); +// print_tensor4(qkv_buf_, +// batch_size, seq_len, head_num_, size_per_head_, +// seq_len * 3 * head_num_ * size_per_head_, +// 3 * head_num_ * size_per_head_, +// size_per_head_, +// batch_size * seq_len * 3 * head_num_ * size_per_head_, +// 2 * head_num_ * size_per_head_ +// ); +// } + if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); + cudaMemsetAsync(k_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); sync_check_cuda_error(); } - invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, + invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, v_buf_2_, qkv_buf_, - nullptr, // padding_offset, + padding_offset, batch_size, seq_len, m, head_num_, size_per_head_, + rotary_embedding_dim_, stream_); - if (true) { - print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); - } - - /* - invokeAddFusedQKVBiasTranspose(q_buf_2_, - k_buf_2_, - v_buf_2_, - PrefixPromptBatchWeightsParam{}, - qkv_buf_, - attention_weights->query_weight.bias, - padding_offset, - batch_size, - seq_len, - m, - head_num_, - size_per_head_, - //rotary_embedding_dim_, - 0, - false, - attention_weights->query_weight.scale_out, - 0, // int8_mode - stream_); - */ sync_check_cuda_error(); - // if (true) { - // print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); - // } +// if (true) { +// print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); +// print_tensor4(k_buf_2_, batch_size, head_num_, seq_len, size_per_head_); +// print_tensor4(v_buf_2_, batch_size, head_num_, seq_len, size_per_head_); +// } + // const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length // // Use batch major diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc 
b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 275c61ad3..9ae318554 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -256,7 +256,8 @@ void LLaMAContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { - invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); + invokeRemovePadding( + decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); sync_check_cuda_error(); } @@ -287,10 +288,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* stream_); sync_check_cuda_error(); - if (true) { - std::cout << l << "==================" << "ATTN_NORM\n"; + if (false) { + std::cout << l << "==================" + << "ATTN_NORM\n"; print_tensor3(decoder_normed_input_, batch_size, seq_len, hidden_units_); - std::cout << l << "==================" << "ATTN_NORM\n"; + std::cout << l << "==================" + << "ATTN_NORM\n"; std::cout << std::flush; } @@ -322,20 +325,21 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; - std::cout << l << "==================" << "QBUF\n"; + // std::cout << l << "==================" << "QBUF\n"; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); - std::cout << l << "==================" << "QBUF\n"; - std::cout << std::flush; - -// if (true) { -// std::cout << l << "==================" << "ATTENTION\n"; -// print_tensor3(self_attn_output_, batch_size, seq_len, hidden_units_); -// std::cout << l << "==================" << "ATTENTION\n"; -// std::cout << std::flush; -// } - + // std::cout << l << "==================" << "QBUF\n"; + // std::cout << std::flush; + + if (false) { + std::cout << l << "==================" + << "ATTENTION\n"; + print_tensor3(self_attn_output_, batch_size, seq_len, hidden_units_); + std::cout << l << "==================" + << "ATTENTION\n"; + std::cout << std::flush; + } invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, @@ -352,7 +356,8 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); TensorMap ffn_input_tensors( - {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); + {{"ffn_input", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); TensorMap ffn_output_tensors( {{"ffn_output", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h index 8f6dcf5ff..a840c4749 100644 --- a/src/fastertransformer/utils/llama_utils.h +++ b/src/fastertransformer/utils/llama_utils.h @@ -84,9 +84,9 @@ static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int stride1, in std::cout << ind; _print_tensor2(&out[i * stride1], dim2, dim3, stride2, indent + 1); if (i != end1 - 1) - std::cout << "\n"; + std::cout << "\n\n"; } - std::cout << "]\n"; + std::cout << "]"; } template @@ -105,7 +105,7 @@ _print_tensor4(T* out, int dim1, int dim2, int 
dim3, int dim4, int stride1, int std::cout << ind; _print_tensor3(&out[i * stride1], dim2, dim3, dim4, stride2, stride3, indent + 1); if (i != dim1 - 1) - std::cout << "\n\n"; + std::cout << "\n\n\n"; } if (start1 != end1) { std::cout << ind; @@ -115,9 +115,9 @@ _print_tensor4(T* out, int dim1, int dim2, int dim3, int dim4, int stride1, int std::cout << ind; _print_tensor3(&out[i * stride1], dim2, dim3, dim4, stride2, stride3, indent + 1); if (i != end1 - 1) - std::cout << "\n"; + std::cout << "\n\n\n"; } - std::cout << "]\n"; + std::cout << "]"; } template @@ -126,15 +126,7 @@ static void print_tensor3(T* in, int dim1, int dim2, int dim3, int stride1, int T* out = (T*)malloc(sizeof(T) * size); cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); _print_tensor3(&out[start], dim1, dim2, dim3, stride1, stride2, 1); - - /* - if (stride2 != dim3) { - for (int i = dim1 * dim2 * 3 * dim3 - 1 * dim3 - 8; i < dim1 * dim2 * 3 * dim3 - 1 * dim3; ++i) { - std::cout << out[i] << " "; - } - std::cout << "\n"; - } - */ + std::cout << "\n"; free(out); } @@ -151,9 +143,6 @@ print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4, int stride1, int st T* out = (T*)malloc(sizeof(T) * size); cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); _print_tensor4(&out[start], dim1, dim2, dim3, dim4, stride1, stride2, stride3, 1); - for (int i = dim1 * dim2 * dim3 * dim4 - 8; i < dim1 * dim2 * dim3 * dim4; ++i) { - std::cout << out[i] << " "; - } std::cout << "\n"; free(out); } From df743e0c8341eedf9876ebf67c4c5761dcabb3c4 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 25 Sep 2023 12:46:11 +0000 Subject: [PATCH 31/55] no cache version --- .../LLaMAContextAttentionLayer.cc | 161 +++++++++++------- src/fastertransformer/models/llama/LLaMA.cc | 16 +- .../models/llama/LLaMAContextDecoder.cc | 8 +- src/fastertransformer/th_op/llama/LLaMA.h | 11 +- src/fastertransformer/utils/llama_utils.h | 10 ++ src/fastertransformer/utils/memory_utils.cu | 2 +- 6 files changed, 121 insertions(+), 87 deletions(-) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 5aecc5de6..111f09740 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -68,6 +68,12 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten PUSH_RANGE("qkv_gemm"); + //std::cout << "G1====================================\n"; + //std::cout << "hidden_units_: " << hidden_units_ << "\n"; + //std::cout << "m: " << m << "\n"; + //std::cout << "G1====================================\n"; + //std::cout << std::flush; + cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, 3 * hidden_units_, // n @@ -80,62 +86,62 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, 3 * hidden_units_ /* n */); sync_check_cuda_error(); -// if (true) { -// print_tensor3(qkv_buf_, -// batch_size, -// seq_len, -// hidden_units_, -// seq_len * 3 * hidden_units_, -// 3 * hidden_units_, -// batch_size * seq_len * 3 * hidden_units_, -// 0); -// print_tensor3(qkv_buf_, -// batch_size, -// seq_len, -// hidden_units_, -// seq_len * 3 * hidden_units_, -// 3 * hidden_units_, -// batch_size * seq_len * 3 * hidden_units_, -// hidden_units_); -// print_tensor3(qkv_buf_, -// batch_size, -// seq_len, -// hidden_units_, -// seq_len * 3 * hidden_units_, -// 3 * hidden_units_, -// batch_size * seq_len * 3 * hidden_units_, -// 
2*hidden_units_); -// } -// if (true) { -// print_tensor4(qkv_buf_, -// batch_size, seq_len, head_num_, size_per_head_, -// seq_len * 3 * head_num_ * size_per_head_, -// 3 * head_num_ * size_per_head_, -// size_per_head_, -// batch_size * seq_len * 3 * head_num_ * size_per_head_, -// 0 -// ); -// print_tensor4(qkv_buf_, -// batch_size, seq_len, head_num_, size_per_head_, -// seq_len * 3 * head_num_ * size_per_head_, -// 3 * head_num_ * size_per_head_, -// size_per_head_, -// batch_size * seq_len * 3 * head_num_ * size_per_head_, -// head_num_ * size_per_head_ -// ); -// print_tensor4(qkv_buf_, -// batch_size, seq_len, head_num_, size_per_head_, -// seq_len * 3 * head_num_ * size_per_head_, -// 3 * head_num_ * size_per_head_, -// size_per_head_, -// batch_size * seq_len * 3 * head_num_ * size_per_head_, -// 2 * head_num_ * size_per_head_ -// ); -// } + // if (true) { + // print_tensor3(qkv_buf_, + // batch_size, + // seq_len, + // hidden_units_, + // seq_len * 3 * hidden_units_, + // 3 * hidden_units_, + // batch_size * seq_len * 3 * hidden_units_, + // 0); + // print_tensor3(qkv_buf_, + // batch_size, + // seq_len, + // hidden_units_, + // seq_len * 3 * hidden_units_, + // 3 * hidden_units_, + // batch_size * seq_len * 3 * hidden_units_, + // hidden_units_); + // print_tensor3(qkv_buf_, + // batch_size, + // seq_len, + // hidden_units_, + // seq_len * 3 * hidden_units_, + // 3 * hidden_units_, + // batch_size * seq_len * 3 * hidden_units_, + // 2*hidden_units_); + // } + // if (true) { + // print_tensor4(qkv_buf_, + // batch_size, seq_len, head_num_, size_per_head_, + // seq_len * 3 * head_num_ * size_per_head_, + // 3 * head_num_ * size_per_head_, + // size_per_head_, + // batch_size * seq_len * 3 * head_num_ * size_per_head_, + // 0 + // ); + // print_tensor4(qkv_buf_, + // batch_size, seq_len, head_num_, size_per_head_, + // seq_len * 3 * head_num_ * size_per_head_, + // 3 * head_num_ * size_per_head_, + // size_per_head_, + // batch_size * seq_len * 3 * head_num_ * size_per_head_, + // head_num_ * size_per_head_ + // ); + // print_tensor4(qkv_buf_, + // batch_size, seq_len, head_num_, size_per_head_, + // seq_len * 3 * head_num_ * size_per_head_, + // 3 * head_num_ * size_per_head_, + // size_per_head_, + // batch_size * seq_len * 3 * head_num_ * size_per_head_, + // 2 * head_num_ * size_per_head_ + // ); + // } if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync(k_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); + cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); sync_check_cuda_error(); } invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, @@ -151,12 +157,11 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten rotary_embedding_dim_, stream_); sync_check_cuda_error(); -// if (true) { -// print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); -// print_tensor4(k_buf_2_, batch_size, head_num_, seq_len, size_per_head_); -// print_tensor4(v_buf_2_, batch_size, head_num_, seq_len, size_per_head_); -// } - + // if (true) { + // print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); + // print_tensor4(k_buf_2_, batch_size, head_num_, seq_len, size_per_head_); + // print_tensor4(v_buf_2_, batch_size, head_num_, seq_len, size_per_head_); + // } // const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length // // Use batch major @@ -194,6 +199,15 @@ void 
LLaMAContextAttentionLayer::forward(TensorMap* output_ten // if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { PUSH_RANGE("Q*K batch gemm"); + //std::cout << "G2====================================\n"; + //std::cout << "attention_seq_len_1: " << attention_seq_len_1 << "\n"; + //std::cout << "attention_seq_len_2: " << attention_seq_len_2 << "\n"; + //std::cout << "batch_size: " << batch_size << "\n"; + //std::cout << "head_num_: " << head_num_ << "\n"; + //std::cout << "size_per_head_: " << size_per_head_ << "\n"; + //std::cout << "G2====================================\n"; + //std::cout << std::flush; + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, CUBLAS_OP_N, attention_seq_len_2, // n @@ -215,7 +229,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten attention_seq_len_2 * attention_seq_len_1, batch_size * head_num_, // global batch size CUDA_R_32F); - sync_check_cuda_error(); POP_RANGE; @@ -236,6 +249,14 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } else { PUSH_RANGE("Q*K batch gemm"); + //std::cout << "G2====================================\n"; + //std::cout << "attention_seq_len_1: " << attention_seq_len_1 << "\n"; + //std::cout << "attention_seq_len_2: " << attention_seq_len_2 << "\n"; + //std::cout << "batch_size: " << batch_size << "\n"; + //std::cout << "head_num_: " << head_num_ << "\n"; + //std::cout << "size_per_head_: " << size_per_head_ << "\n"; + //std::cout << "G2====================================\n"; + //std::cout << std::flush; cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, CUBLAS_OP_N, attention_seq_len_2, @@ -270,6 +291,14 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } PUSH_RANGE("QK*V batch gemm"); + //std::cout << "G3====================================\n"; + //std::cout << "attention_seq_len_1: " << attention_seq_len_1 << "\n"; + //std::cout << "attention_seq_len_2: " << attention_seq_len_2 << "\n"; + //std::cout << "batch_size: " << batch_size << "\n"; + //std::cout << "head_num_: " << head_num_ << "\n"; + //std::cout << "size_per_head_: " << size_per_head_ << "\n"; + //std::cout << "G3====================================\n"; + //std::cout << std::flush; cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, CUBLAS_OP_N, size_per_head_, @@ -433,26 +462,26 @@ template void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, true); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, true); + qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, false); k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; v_buf_2_ = k_buf_2_ + batch_size * seq_len * hidden_units_; // save memory usage when using fmha if (allocate_qk_buf) { - qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, true); + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, false); } else { allocator_->free((void**)(&qk_buf_)); qk_buf_ = nullptr; } - qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, true); - qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * 
hidden_units_, true); + qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, false); + qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, false); if (is_qk_buf_float_ == true) { if (allocate_qk_buf) { qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, true); + qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, false); } else { allocator_->free((void**)(&qk_buf_float_)); diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index cdf8071e6..988809cfa 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -58,10 +58,8 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ input_attention_mask_ = (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); - decoder_output_buf_ = - (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); normed_decoder_output_buf_ = - (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); @@ -84,7 +82,6 @@ void LLaMA::freeBuffer() { if (is_allocate_buffer_) { allocator_->free((void**)(&input_attention_mask_)); - allocator_->free((void**)(&decoder_output_buf_)); allocator_->free((void**)(&logits_buf_)); allocator_->free((void**)(&key_cache_)); @@ -205,7 +202,8 @@ void LLaMA::forward(std::unordered_map* output_ten const std::unordered_map* input_tensors, const LLaMAWeight* llama_weights) { - // Logger::getLogger().setLevel(Logger::Level::TRACE); + // Logger::getLogger().setLevel(Logger::Level::DEBUG); + // // input_tensors: // input_ids [batch_size, seq_len] // input_lengths [batch_size] @@ -265,10 +263,6 @@ void LLaMA::forward(std::unordered_map* output_ten hidden_units_, stream_); sync_check_cuda_error(); - -// std::cout << 0 << "==================" << "EMBEDDING\n"; -// print_tensor3(context_decoder_input_buf_, batch_size, seq_len, hidden_units_); -// std::cout << 0 << "==================" << "EMBEDDING\n"; } std::unordered_map decoder_input_tensors{ @@ -312,10 +306,12 @@ void LLaMA::forward(std::unordered_map* output_ten vocab_size_); sync_check_cuda_error(); + if (std::is_same::value) { float* output_logits = output_tensors->at("output_logits").getPtr(); - invokeCudaCast(output_logits, logits_buf_, batch_size * seq_len * vocab_size_, stream_); + invokeCudaCast(output_logits, logits_buf_, batch_size * seq_len * vocab_size_, stream_); sync_check_cuda_error(); + //print_tensor3(output_logits, batch_size, seq_len, vocab_size_); } } } diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 9ae318554..41876cc4e 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -67,7 +67,7 @@ void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, 
false)); decoder_layer_output_ = reinterpret_cast( allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); - h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); + h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), false, false); padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); @@ -325,12 +325,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; - // std::cout << l << "==================" << "QBUF\n"; + //std::cout << l << "==================" << "ATTENTION\n"; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); - // std::cout << l << "==================" << "QBUF\n"; - // std::cout << std::flush; + //std::cout << l << "==================" << "ATTENTION\n"; + //std::cout << std::flush; if (false) { std::cout << l << "==================" diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index ab594c5c7..9a7cb9168 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -114,16 +114,15 @@ class FTLLaMA: public IFLLaMA { delete cublas_wrapper_mutex_; } - virtual void forward(th::Tensor& output_logits, - th::Tensor& input_ids, - th::Tensor& input_lengths, - const int start_pos) override + virtual void + forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int start_pos) override { auto stream = at::cuda::getCurrentCUDAStream().stream(); cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); cublasSetStream(cublasHandle, stream); - ft::Allocator allocator = ft::Allocator(); - ft::cublasMMWrapper cublas_wrapper = ft::cublasMMWrapper( + ft::Allocator allocator = + ft::Allocator(at::cuda::getCurrentCUDAStream().device_index()); + ft::cublasMMWrapper cublas_wrapper = ft::cublasMMWrapper( cublasHandle, cublasltHandle_, stream, cublas_algo_map_, cublas_wrapper_mutex_, &allocator); if (std::is_same::value) { diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h index a840c4749..d23f5ba8e 100644 --- a/src/fastertransformer/utils/llama_utils.h +++ b/src/fastertransformer/utils/llama_utils.h @@ -120,6 +120,16 @@ _print_tensor4(T* out, int dim1, int dim2, int dim3, int dim4, int stride1, int std::cout << "]"; } +template +static void print_tensor1(T* in, int dim1) +{ + T* out = (T*)malloc(sizeof(T) * dim1); + cudaMemcpy(out, in, sizeof(T) * dim1, cudaMemcpyDeviceToHost); + _print_tensor1(out, dim1, 1); + std::cout << "\n"; + free(out); +} + template static void print_tensor3(T* in, int dim1, int dim2, int dim3, int stride1, int stride2, int size, int start) { diff --git a/src/fastertransformer/utils/memory_utils.cu b/src/fastertransformer/utils/memory_utils.cu index d795cbf99..134224a09 100644 --- a/src/fastertransformer/utils/memory_utils.cu +++ b/src/fastertransformer/utils/memory_utils.cu @@ -177,7 +177,7 @@ __global__ void cudaCast(T_OUT* dst, T_IN* src, const size_t size) 
template void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream) { - cudaCast<<<(size + 255) / 256, 256, 0, stream>>>(dst, src, size); + cudaCast<<<256, 256, 0, stream>>>(dst, src, size); } template void invokeCudaCast(float* dst, half const* const src, const size_t size, cudaStream_t stream); From 220aec06594944fc787dfd170f9b4f50bfce1350 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 25 Sep 2023 12:51:11 +0000 Subject: [PATCH 32/55] no-cache version bug fix --- src/fastertransformer/models/llama/LLaMA.cc | 1 - .../models/llama/LLaMAContextDecoder.cc | 23 +------------------ 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 988809cfa..e8abe28f9 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -311,7 +311,6 @@ void LLaMA::forward(std::unordered_map* output_ten float* output_logits = output_tensors->at("output_logits").getPtr(); invokeCudaCast(output_logits, logits_buf_, batch_size * seq_len * vocab_size_, stream_); sync_check_cuda_error(); - //print_tensor3(output_logits, batch_size, seq_len, vocab_size_); } } } diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 41876cc4e..41abb0006 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -67,7 +67,7 @@ void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); decoder_layer_output_ = reinterpret_cast( allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); - h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), false, false); + h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); @@ -288,15 +288,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* stream_); sync_check_cuda_error(); - if (false) { - std::cout << l << "==================" - << "ATTN_NORM\n"; - print_tensor3(decoder_normed_input_, batch_size, seq_len, hidden_units_); - std::cout << l << "==================" - << "ATTN_NORM\n"; - std::cout << std::flush; - } - TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, {"attention_mask", @@ -325,21 +316,9 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; - //std::cout << l << "==================" << "ATTENTION\n"; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); - //std::cout << l << "==================" << "ATTENTION\n"; - //std::cout << std::flush; - - if (false) { - std::cout << l << "==================" - << "ATTENTION\n"; - print_tensor3(self_attn_output_, 
batch_size, seq_len, hidden_units_); - std::cout << l << "==================" - << "ATTENTION\n"; - std::cout << std::flush; - } invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, From 4fb06e7ba2cac488f9f144acb87d3087d328e35d Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 25 Sep 2023 16:49:39 +0000 Subject: [PATCH 33/55] cache version --- src/fastertransformer/kernels/CMakeLists.txt | 4 + .../kernels/llama_kernels.cu | 56 +++++ src/fastertransformer/kernels/llama_kernels.h | 15 ++ .../kernels/unfused_attention_kernels.cu | 214 ++++++++++++++++-- .../kernels/unfused_attention_kernels.h | 26 +++ .../LLaMAContextAttentionLayer.cc | 166 ++++---------- .../LLaMAContextAttentionLayer.h | 2 +- .../models/llama/CMakeLists.txt | 3 +- src/fastertransformer/models/llama/LLaMA.cc | 33 +-- src/fastertransformer/models/llama/LLaMA.h | 21 +- .../models/llama/LLaMAContextDecoder.cc | 1 + src/fastertransformer/th_op/llama/LLaMA.h | 103 +++++---- src/fastertransformer/utils/llama_utils.h | 10 + 13 files changed, 437 insertions(+), 217 deletions(-) create mode 100644 src/fastertransformer/kernels/llama_kernels.cu create mode 100644 src/fastertransformer/kernels/llama_kernels.h diff --git a/src/fastertransformer/kernels/CMakeLists.txt b/src/fastertransformer/kernels/CMakeLists.txt index fd2a1b494..c5cc14c8e 100644 --- a/src/fastertransformer/kernels/CMakeLists.txt +++ b/src/fastertransformer/kernels/CMakeLists.txt @@ -233,3 +233,7 @@ add_library(moe_kernels STATIC moe_kernels.cu) set_property(TARGET moe_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET moe_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(moe_kernels PRIVATE moe_gemm_kernels) + +add_library(llama_kernels STATIC llama_kernels.cu) +set_property(TARGET llama_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET llama_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu new file mode 100644 index 000000000..3c753f866 --- /dev/null +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -0,0 +1,56 @@ +#include "src/fastertransformer/utils/cuda_fp8_utils.h" +#include "src/fastertransformer/kernels/llama_kernels.h" + +namespace fastertransformer { + +template +__global__ void LLaMAbuildDecoderAttentionMaskKernel( + T* attention_mask, const int* sequence_lengths, const int batch_size, const int seq_len, const int start_pos) +{ + // sequence_lengths: + // [batch_size] + // attention_mask: + // [batch_size, 1, seq_len, seq_len + start_pos] + const int max_length = seq_len + start_pos; + const int mask_size_per_seq = seq_len * max_length; + attention_mask += blockIdx.x * mask_size_per_seq; + const int seq_length = sequence_lengths[blockIdx.x]; + + for (int i = threadIdx.x; i < mask_size_per_seq; i += blockDim.x) { + int row_id = i / max_length; + int col_id = i % max_length; + if (row_id < seq_length && col_id <= (row_id + start_pos)) { + attention_mask[i] = (T)(1.0f); + } + else { + attention_mask[i] = (T)(0.0f); + } + } +} + +template +void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, + const int* sequence_lengths, + const int batch_size, + const int seq_len, + const int start_pos, + cudaStream_t stream) +{ + LLaMAbuildDecoderAttentionMaskKernel + <<>>(attention_mask, sequence_lengths, batch_size, seq_len, start_pos); +} + +template void invokeLLaMABuildDecoderAttentionMask(float* attention_mask, + const int* sequence_lengths, + const int batch_size, + const int 
seq_len, + const int start_pos, + cudaStream_t stream); + +template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, + const int* sequence_lengths, + const int batch_size, + const int seq_len, + const int start_pos, + cudaStream_t stream); +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h new file mode 100644 index 000000000..320b5624f --- /dev/null +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -0,0 +1,15 @@ +#pragma once + + +#include "src/fastertransformer/utils/cuda_fp8_utils.h" +#include "src/fastertransformer/utils/memory_utils.h" +namespace fastertransformer { + +template +void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, + const int* sequence_lengths, + const int batch_size, + const int seq_len, + const int start_pos, + cudaStream_t stream); +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 97df58261..1010ca3f3 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1591,7 +1591,8 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, const int seq_len, const int head_num, const int size_per_head, - const int rotary_embedding_dim) + const int rotary_embedding_dim, + const int start_pos) { constexpr int vec_size = Vec_t::size; using Vec_t = typename Vec_t::Type; @@ -1610,9 +1611,9 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, const int hidden_idx = head_idx * size_per_head + tidx * vec_size; const int n = head_num * size_per_head; - const int src_q_idx = token_idx * 3 * n + hidden_idx; - const int src_k_idx = token_idx * 3 * n + hidden_idx + n; - const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; + const int src_q_idx = token_idx * 3 * n + hidden_idx; + const int src_k_idx = token_idx * 3 * n + hidden_idx + n; + const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; Vec_t q, k, v; if (!is_masked) { @@ -1621,7 +1622,7 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, v = *reinterpret_cast(&QKV[src_v_idx]); } - mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, seq_idx); + mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, start_pos + seq_idx); const int dest_q_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + seq_idx * size_per_head + tidx * vec_size; @@ -1648,12 +1649,22 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, + const int start_pos, cudaStream_t stream) { - dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); - dim3 grid(token_num, head_num); - llama_add_fusedQKV_bias_transpose_kernel<<>>( - q_buf, k_buf, v_buf, QKV, padding_offset, batch_size, seq_len, head_num, size_per_head, rotary_embedding_dim); + dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); + dim3 grid(token_num, head_num); + llama_add_fusedQKV_bias_transpose_kernel<<>>(q_buf, + k_buf, + v_buf, + QKV, + padding_offset, + batch_size, + seq_len, + head_num, + size_per_head, + rotary_embedding_dim, + start_pos); } template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, @@ -1667,6 +1678,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, + const int 
start_pos, cudaStream_t stream); template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, @@ -1680,6 +1692,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, + const int start_pos, cudaStream_t stream); #ifdef ENABLE_BF16 template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, @@ -1693,6 +1706,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, + const int start_pos, cudaStream_t stream); #endif @@ -1875,6 +1889,7 @@ void invokeTranspose4dBatchMajor(T* k_dst, const int size_per_head, \ const int local_head_num, \ cudaStream_t stream) + INSTANTIATETRANSPOSE4DBATCHMAJOR(float); INSTANTIATETRANSPOSE4DBATCHMAJOR(half); #ifdef ENABLE_BF16 @@ -1882,6 +1897,169 @@ INSTANTIATETRANSPOSE4DBATCHMAJOR(__nv_bfloat16); #endif #undef INSTANTIATETRANSPOSE4DBATCHMAJOR +template +__global__ void transpose_4d_save_to_cache(T* k_dst, + const T* k_src, + T* v_dst, + const T* v_src, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len, + const int start_pos) +{ + // [batch_size, head_num, seq_len, size_per_head] + const int batch_id = blockIdx.y; + const int head_id = blockIdx.z; + + // 16 byte loads will handle "x" dimension + auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * seq_len + + head_id * size_per_head * seq_len); + auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len + + start_pos * size_per_head + ); + auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * seq_len + + head_id * size_per_head * seq_len); + auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len + + start_pos * size_per_head + ); + + // idx is over output dimension L * size_per_head / x for values + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + const int size_per_head_div_x = size_per_head / X_ELEMS; + + if (idx >= size_per_head_div_x * seq_len) { + return; + } + + key_dst[idx] = key_src[idx]; + val_dst[idx] = val_src[idx]; +} + +template +void invokeLLaMASaveToCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int local_batch_size, + const int seq_len, + const int max_seq_len, + const int size_per_head, + const int local_head_num, + const int start_pos, + cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 
4 : 8; + dim3 grid((seq_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); + + transpose_4d_save_to_cache<<>>( + k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, start_pos); +} + +#define INSTANTIATESAVETOCACHE(T) \ + template void invokeLLaMASaveToCache(T* k_dst, \ + T* v_dst, \ + const T* k_src, \ + const T* v_src, \ + const int local_batch_size, \ + const int seq_len, \ + const int max_seq_len, \ + const int size_per_head, \ + const int local_head_num, \ + const int start_pos, \ + cudaStream_t stream) +INSTANTIATESAVETOCACHE(float); +INSTANTIATESAVETOCACHE(half); +#ifdef ENABLE_BF16 +INSTANTIATESAVETOCACHE(__nv_bfloat16); +#endif +#undef INSTANTIATESAVETOCACHE + +template +__global__ void transpose_4d_load_from_cache(T* k_dst, + const T* k_src, + T* v_dst, + const T* v_src, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len, + const int start_pos) +{ + // [batch_size, head_num, start_pos+seq_len, size_per_head] + const int batch_id = blockIdx.y; + const int head_id = blockIdx.z; + const int real_seq_len = start_pos + seq_len; + + // 16 byte loads will handle "x" dimension + auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len); + auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * real_seq_len + + head_id * size_per_head * real_seq_len); + auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len); + auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * real_seq_len + + head_id * size_per_head * real_seq_len); + + // idx is over output dimension L * size_per_head / x for values + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + const int size_per_head_div_x = size_per_head / X_ELEMS; + + if (idx >= size_per_head_div_x * real_seq_len) { + return; + } + + key_dst[idx] = key_src[idx]; + val_dst[idx] = val_src[idx]; +} + +template +void invokeLLaMALoadFromCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int local_batch_size, + const int seq_len, + const int max_seq_len, + const int size_per_head, + const int local_head_num, + const int start_pos, + cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 
4 : 8; + dim3 grid(((start_pos + seq_len) * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); + + transpose_4d_load_from_cache<<>>( + k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, start_pos); +} + +#define INSTANTIATELOADFROMCACHE(T) \ + template void invokeLLaMALoadFromCache(T* k_dst, \ + T* v_dst, \ + const T* k_src, \ + const T* v_src, \ + const int local_batch_size, \ + const int seq_len, \ + const int max_seq_len, \ + const int size_per_head, \ + const int local_head_num, \ + const int start_pos, \ + cudaStream_t stream) +INSTANTIATELOADFROMCACHE(float); +INSTANTIATELOADFROMCACHE(half); +#ifdef ENABLE_BF16 +INSTANTIATELOADFROMCACHE(__nv_bfloat16); +#endif +#undef INSTANTIATELOADFROMCACHE + template __global__ void addRelativeAttentionBias( T* qk_buf, const T* relative_attention_bias, const int batch_size, const int head_num, const int seq_len) @@ -1942,8 +2120,8 @@ INSTANTIATEADDRELATIVEATTENTIONBIAS(__nv_bfloat16); // m = batch*window_num*window_len // mm_qkv is [m, head*3*size_per_head] row-major // bias_qkv is [head*3*size_per_head] -// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, size_per_head] row-major -// grid(window_len, window_num, 3*batch); +// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, +// size_per_head] row-major grid(window_len, window_num, 3*batch); // block(num_head * size_per_head) template __global__ void add_head3Size_QKV_bias(const T* mm_qkv, @@ -1993,8 +2171,8 @@ __global__ void add_head3Size_QKV_bias(const T* mm_qkv, // m = batch*window_num*window_len // mm_qkv is [m, head*3*size_per_head] row-major // bias_qkv is [head*3*size_per_head] -// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, size_per_head] row-major -// grid(window_len, window_num, 3*batch); +// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, +// size_per_head] row-major grid(window_len, window_num, 3*batch); // block(num_head * size_per_head) template<> __global__ void add_head3Size_QKV_bias(const float2* mm_qkv, @@ -2046,8 +2224,8 @@ __global__ void add_head3Size_QKV_bias(const float2* mm_qkv, // m = batch*window_num*window_len // mm_qkv is [m, head*3*size_per_head] row-major // bias_qkv is [head*3*size_per_head] -// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, size_per_head] row-major -// grid(window_len, window_num, batch); +// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, +// size_per_head] row-major grid(window_len, window_num, batch); // block(num_head * size_per_head) template<> __global__ void add_head3Size_QKV_bias(const half2* mm_qkv, @@ -2237,7 +2415,8 @@ INSTANTIATEADDHEAD3SIZEQKVBIAS(__nv_bfloat16); #endif #undef INSTANTIATEADDHEAD3SIZEQKVBIAS -/******************* invokeMaskedSoftMaxWithRelPosBias ***********************/ +/******************* invokeMaskedSoftMaxWithRelPosBias + * ***********************/ // grid = (window_len/word_per_thread, window_num*num_head, batch_size) // block.x = max(32, (window_len + 31)/32*32) @@ -2586,7 +2765,8 @@ __global__ void transpose_attentions( // attentions_in shape [B, H, S, S] // attentions_out shape [B, L, H, S, S]. // Note that we write the L dimension as if it was index 0. - // In reality, the pointer has already been shifted to point to the correct layer. + // In reality, the pointer has already been shifted to point to the + // correct layer. 
const auto batch_idx = blockIdx.x; const auto head_idx = blockIdx.y; diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index 0ccf64d8c..2d4b01dde 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -125,6 +125,7 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, + const int start_pos, cudaStream_t stream); template @@ -203,6 +204,31 @@ void invokeTranspose4dBatchMajor(T* k_dst, const int local_head_num, cudaStream_t stream); +template +void invokeLLaMASaveToCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int local_batch_size, + const int seq_len, + const int max_seq_len, + const int size_per_head, + const int local_head_num, + const int start_pos, + cudaStream_t stream); +template +void invokeLLaMALoadFromCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int local_batch_size, + const int seq_len, + const int max_seq_len, + const int size_per_head, + const int local_head_num, + const int start_pos, + cudaStream_t stream); + template void invokeAddRelativeAttentionBias(T* qk_buf, const T* relative_attention_bias, diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 111f09740..a9989543a 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -39,13 +39,14 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // output_tensors: // hidden_features [token_num, hidden_dimension] - // key_cache [batch, local_head_num, size_per_head // x, max_seq_len, x] + // key_cache [batch, local_head_num, max_seq_len, size_per_head] // value_cache [batch, local_head_num, max_seq_len, size_per_head] FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - FT_CHECK(output_tensors->at("key_cache").shape.size() == 5); + FT_CHECK(output_tensors->at("key_cache").shape.size() == 4); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); const int batch_size = input_tensors->at("attention_mask").shape[0]; const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int max_seq_len = (int)(output_tensors->at("key_cache").shape[2]); const int layer_id = input_tensors->getVal("layer_id"); const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); @@ -60,7 +61,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten "LLaMA Context FUSED_PADDED_MHA is not supported !"); PUSH_RANGE("attention buffer alloc"); - allocateBuffer(batch_size, seq_len, attention_type != AttentionType::FUSED_MHA); + allocateBuffer(batch_size, seq_len, max_seq_len, attention_type != AttentionType::FUSED_MHA); POP_RANGE; sync_check_cuda_error(); @@ -68,12 +69,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten PUSH_RANGE("qkv_gemm"); - //std::cout << "G1====================================\n"; - //std::cout << "hidden_units_: " << hidden_units_ << "\n"; - //std::cout << "m: " << m << "\n"; - //std::cout << "G1====================================\n"; - //std::cout << std::flush; - cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, 3 * hidden_units_, // n @@ -86,58 +81,6 @@ void 
LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, 3 * hidden_units_ /* n */); sync_check_cuda_error(); - // if (true) { - // print_tensor3(qkv_buf_, - // batch_size, - // seq_len, - // hidden_units_, - // seq_len * 3 * hidden_units_, - // 3 * hidden_units_, - // batch_size * seq_len * 3 * hidden_units_, - // 0); - // print_tensor3(qkv_buf_, - // batch_size, - // seq_len, - // hidden_units_, - // seq_len * 3 * hidden_units_, - // 3 * hidden_units_, - // batch_size * seq_len * 3 * hidden_units_, - // hidden_units_); - // print_tensor3(qkv_buf_, - // batch_size, - // seq_len, - // hidden_units_, - // seq_len * 3 * hidden_units_, - // 3 * hidden_units_, - // batch_size * seq_len * 3 * hidden_units_, - // 2*hidden_units_); - // } - // if (true) { - // print_tensor4(qkv_buf_, - // batch_size, seq_len, head_num_, size_per_head_, - // seq_len * 3 * head_num_ * size_per_head_, - // 3 * head_num_ * size_per_head_, - // size_per_head_, - // batch_size * seq_len * 3 * head_num_ * size_per_head_, - // 0 - // ); - // print_tensor4(qkv_buf_, - // batch_size, seq_len, head_num_, size_per_head_, - // seq_len * 3 * head_num_ * size_per_head_, - // 3 * head_num_ * size_per_head_, - // size_per_head_, - // batch_size * seq_len * 3 * head_num_ * size_per_head_, - // head_num_ * size_per_head_ - // ); - // print_tensor4(qkv_buf_, - // batch_size, seq_len, head_num_, size_per_head_, - // seq_len * 3 * head_num_ * size_per_head_, - // 3 * head_num_ * size_per_head_, - // size_per_head_, - // batch_size * seq_len * 3 * head_num_ * size_per_head_, - // 2 * head_num_ * size_per_head_ - // ); - // } if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous @@ -155,34 +98,39 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten head_num_, size_per_head_, rotary_embedding_dim_, + start_pos, stream_); sync_check_cuda_error(); - // if (true) { - // print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); - // print_tensor4(k_buf_2_, batch_size, head_num_, seq_len, size_per_head_); - // print_tensor4(v_buf_2_, batch_size, head_num_, seq_len, size_per_head_); - // } - - // const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length - // // Use batch major - // // put k/v_buf from shape [B, H, L, Dh] - // // to cache [B, H, Dh/x, L, x] and [B, H, L, Dh/x, x] - // // TODO: Cache implementation - // // k_cache: [batch_size, num_heads, L, Dh] - // // k_buf: [batch_size, num_heads, start_pos + seq_len, Dh] - // // v_buf: [batch_size, num_heads, L, Dh] - // invokeTranspose4dBatchMajor(output_tensors->getPtr("key_cache"), - // output_tensors->getPtr("value_cache"), - // k_buf_2_, - // v_buf_2_, - // batch_size, - // seq_len, - // max_seq_len, - // size_per_head_, - // head_num_, - // stream_); - // sync_check_cuda_error(); - // POP_RANGE; + + // key_cache [batch, local_head_num, max_seq_len, size_per_head] + // value_cache [batch, local_head_num, max_seq_len, size_per_head] + T* key_cache = output_tensors->getPtr("key_cache"); + T* value_cache = output_tensors->getPtr("value_cache"); + invokeLLaMASaveToCache(key_cache, + value_cache, + k_buf_2_, + v_buf_2_, + batch_size, + seq_len, + max_seq_len, + size_per_head_, + head_num_, + start_pos, + stream_); + sync_check_cuda_error(); + POP_RANGE; + + invokeLLaMALoadFromCache(k_buf_2_, + v_buf_2_, + key_cache, + value_cache, + batch_size, + seq_len, + max_seq_len, + size_per_head_, + head_num_, + start_pos, + stream_); if (attention_type == AttentionType::FUSED_MHA) { 
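        // Summary of the KV-cache handling above in this patch:
        //  - invokeLLaMAAddFusedQKVBiasTranspose wrote Q/K/V into q_buf_2_/k_buf_2_/v_buf_2_ in
        //    [batch, head_num_, seq_len, size_per_head_] layout and applied rotary embedding at
        //    absolute positions start_pos + seq_idx.
        //  - invokeLLaMASaveToCache appended this step's K/V into key_cache/value_cache
        //    ([batch, head_num_, max_seq_len, size_per_head_]), starting at row start_pos.
        //  - invokeLLaMALoadFromCache then reloaded the full history (start_pos + seq_len rows per
        //    head) back into k_buf_2_/v_buf_2_, which is why the unfused path uses
        //    attention_seq_len_2 = start_pos + seq_len as the KV length.
        //  - This FUSED_MHA branch instead relies on the fused causal-masked kernel set up below,
        //    skipping the explicit batched GEMM + softmax pipeline of the else branch.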
dispatcher_fp16->setup_causal_masked_fmha(seq_len, batch_size); @@ -190,8 +138,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } else { const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = seq_len; // q length - const int attention_seq_len_2 = seq_len; // kv length + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = start_pos + seq_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); // @@ -199,14 +147,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { PUSH_RANGE("Q*K batch gemm"); - //std::cout << "G2====================================\n"; - //std::cout << "attention_seq_len_1: " << attention_seq_len_1 << "\n"; - //std::cout << "attention_seq_len_2: " << attention_seq_len_2 << "\n"; - //std::cout << "batch_size: " << batch_size << "\n"; - //std::cout << "head_num_: " << head_num_ << "\n"; - //std::cout << "size_per_head_: " << size_per_head_ << "\n"; - //std::cout << "G2====================================\n"; - //std::cout << std::flush; cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, CUBLAS_OP_N, @@ -249,14 +189,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } else { PUSH_RANGE("Q*K batch gemm"); - //std::cout << "G2====================================\n"; - //std::cout << "attention_seq_len_1: " << attention_seq_len_1 << "\n"; - //std::cout << "attention_seq_len_2: " << attention_seq_len_2 << "\n"; - //std::cout << "batch_size: " << batch_size << "\n"; - //std::cout << "head_num_: " << head_num_ << "\n"; - //std::cout << "size_per_head_: " << size_per_head_ << "\n"; - //std::cout << "G2====================================\n"; - //std::cout << std::flush; cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, CUBLAS_OP_N, attention_seq_len_2, @@ -291,14 +223,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } PUSH_RANGE("QK*V batch gemm"); - //std::cout << "G3====================================\n"; - //std::cout << "attention_seq_len_1: " << attention_seq_len_1 << "\n"; - //std::cout << "attention_seq_len_2: " << attention_seq_len_2 << "\n"; - //std::cout << "batch_size: " << batch_size << "\n"; - //std::cout << "head_num_: " << head_num_ << "\n"; - //std::cout << "size_per_head_: " << size_per_head_ << "\n"; - //std::cout << "G3====================================\n"; - //std::cout << std::flush; cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, CUBLAS_OP_N, size_per_head_, @@ -459,17 +383,21 @@ void LLaMAContextAttentionLayer::allocateBuffer() } template -void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf) +void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, + size_t seq_len, + size_t max_seq_len, + bool allocate_qk_buf) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, false); - k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; - v_buf_2_ = k_buf_2_ + batch_size * seq_len * hidden_units_; + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * max_seq_len * 3 * hidden_units_, false); + k_buf_2_ = q_buf_2_ + batch_size * max_seq_len * hidden_units_; + v_buf_2_ = k_buf_2_ + batch_size * max_seq_len * hidden_units_; // save memory usage when using fmha if 
(allocate_qk_buf) { - qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, false); + qk_buf_ = + (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * max_seq_len * max_seq_len, false); } else { allocator_->free((void**)(&qk_buf_)); @@ -481,7 +409,7 @@ void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq if (is_qk_buf_float_ == true) { if (allocate_qk_buf) { qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, false); + qk_buf_float_, sizeof(float) * batch_size * head_num_ * max_seq_len * max_seq_len, false); } else { allocator_->free((void**)(&qk_buf_float_)); diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index 85fd74af8..7300186ba 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -38,7 +38,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { std::unique_ptr dispatcher_fp16; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf); + void allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len, bool allocate_qk_buf); void freeBuffer() override; using BaseAttentionLayer::is_free_buffer_after_forward_; diff --git a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt index 0c5106f00..24acf1d78 100644 --- a/src/fastertransformer/models/llama/CMakeLists.txt +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -27,7 +27,7 @@ target_link_libraries(LLaMAContextDecoder PUBLIC -lcudart cublasMMWrapper FfnLayer layernorm_kernels add_residual_kernels - gpt_kernels + llama_kernels tensor nccl_utils cuda_utils @@ -45,6 +45,7 @@ target_link_libraries(LLaMA PUBLIC -lcudart LLaMAContextDecoder decoding_kernels gpt_kernels + llama_kernels BaseBeamSearchLayer bert_preprocess_kernels tensor diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index e8abe28f9..29caa3722 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -17,10 +17,10 @@ #include "src/fastertransformer/models/llama/LLaMA.h" #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/decoding_kernels.h" -#include "src/fastertransformer/kernels/gpt_kernels.h" +#include "src/fastertransformer/kernels/llama_kernels.h" #include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" -#include "src/fastertransformer/utils/memory_utils.h" #include "src/fastertransformer/utils/llama_utils.h" +#include "src/fastertransformer/utils/memory_utils.h" #include #include @@ -57,7 +57,7 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batch_size * max_seq_len * hidden_units_; input_attention_mask_ = - (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); + (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_seq_len, false)); normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); 
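With the cache now sized for max_seq_len on every sample, the K/V cache becomes the dominant allocation: self_cache_size above is per-rank layers × batch × max_seq_len × hidden_units, counted once for keys and once for values. A quick sizing sketch, using purely hypothetical model numbers rather than anything taken from this patch:

```
#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t num_layer = 32, pp_size = 2;   // layers, pipeline-parallel degree (assumed)
    const std::size_t batch = 8, max_seq_len = 2048; // assumed request shape
    const std::size_t head_num = 32, size_per_head = 128;
    const std::size_t hidden_units = head_num * size_per_head;
    const std::size_t elem = 2;                      // bytes per element for FP16

    // Mirrors the self_cache_size expression above; x2 because key_cache_ and
    // value_cache_ each take one copy.
    const std::size_t self_cache_elems = (num_layer / pp_size) * batch * max_seq_len * hidden_units;
    std::printf("KV cache per rank: %.2f GiB\n",
                2.0 * self_cache_elems * elem / (1024.0 * 1024.0 * 1024.0));
    return 0;
}
```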
logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); @@ -85,9 +85,6 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&logits_buf_)); allocator_->free((void**)(&key_cache_)); - if (cache_indirections_[0] != nullptr) { - allocator_->free((void**)(&cache_indirections_)[0]); - } allocator_->free((void**)(&tiled_input_ids_buf_)); allocator_->free((void**)(&tiled_input_lengths_buf_)); @@ -217,11 +214,7 @@ void LLaMA::forward(std::unordered_map* output_ten FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); const size_t batch_size = input_tensors->at("input_ids").shape[0]; - - // NOTE: Prefix Prompt PreProcessing - // get prefix_prompt_weight for each batch --> shape [batch, 1] - // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] - int seq_len = input_tensors->at("input_ids").shape[1]; + int seq_len = input_tensors->at("input_ids").shape[1]; // max cache seq len should include max prefix prompt length as it has k/v states const int start_pos = input_tensors->at("start_pos").max(); @@ -231,12 +224,8 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); const DataType data_type = getTensorType(); - const std::vector self_k_cache_shape = {num_layer_ / pipeline_para_.world_size_, - batch_size, - head_num_, - size_per_head_ / (16 / sizeof(T)), - max_seq_len_, - 16 / sizeof(T)}; + const std::vector self_k_cache_shape = { + num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}; const std::vector self_v_cache_shape = { num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}; @@ -250,8 +239,8 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); - invokeBuildDecoderAttentionMask( - input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, seq_len, 0, stream_); + invokeLLaMABuildDecoderAttentionMask( + input_attention_mask_, tiled_input_lengths_buf_, batch_size, seq_len, start_pos, stream_); sync_check_cuda_error(); if (pipeline_para_.rank_ == 0) { @@ -269,7 +258,10 @@ void LLaMA::forward(std::unordered_map* output_ten {"decoder_input", Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_input_buf_}}, {"attention_mask", - Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(seq_len)}, input_attention_mask_}}, + Tensor{MEMORY_GPU, + data_type, + {batch_size, 1, (size_t)seq_len, (size_t)(start_pos + seq_len)}, + input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, tiled_input_lengths_buf_}}, {"start_pos", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &start_pos}}}; @@ -306,7 +298,6 @@ void LLaMA::forward(std::unordered_map* output_ten vocab_size_); sync_check_cuda_error(); - if (std::is_same::value) { float* output_logits = output_tensors->at("output_logits").getPtr(); invokeCudaCast(output_logits, logits_buf_, batch_size * seq_len * vocab_size_, stream_); diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index dab7a0509..52f969a74 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -59,21 +59,20 @@ class LLaMA: public BaseLayer { void initialize(); protected: - T* input_attention_mask_; - T* decoder_output_buf_; - T* normed_decoder_output_buf_; + T* input_attention_mask_ = nullptr; + T* decoder_output_buf_ = nullptr; + T* normed_decoder_output_buf_ = nullptr; - T* 
logits_buf_; + T* logits_buf_ = nullptr; - T* key_cache_; - T* value_cache_; - int* cache_indirections_[2] = {nullptr, nullptr}; + T* key_cache_ = nullptr; + T* value_cache_ = nullptr; - int* tiled_input_ids_buf_; - int* tiled_input_lengths_buf_; + int* tiled_input_ids_buf_ = nullptr; + int* tiled_input_lengths_buf_ = nullptr; - T* context_decoder_input_buf_; - T* context_decoder_output_buf_; + T* context_decoder_input_buf_ = nullptr; + T* context_decoder_output_buf_ = nullptr; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 41abb0006..0f99e0887 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -215,6 +215,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* const int start_pos = input_tensors->at("start_pos").max(); const DataType data_type = getTensorType(); allocateBuffer(batch_size, seq_len); + sync_check_cuda_error(); T* decoder_input = input_tensors->at("decoder_input").getPtr(); T* decoder_output = output_tensors->at("decoder_output").getPtr(); diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 9a7cb9168..7595a2a88 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -103,38 +103,24 @@ class FTLLaMA: public IFLLaMA { llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * num_layers_ + 3]); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); - } - - ~FTLLaMA() override - { - ft::ftNcclParamDestroy(tensor_para_); - ft::ftNcclParamDestroy(pipeline_para_); - cublasLtDestroy(cublasltHandle_); - delete cublas_algo_map_; - delete cublas_wrapper_mutex_; - } - virtual void - forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int start_pos) override - { auto stream = at::cuda::getCurrentCUDAStream().stream(); cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); cublasSetStream(cublasHandle, stream); - ft::Allocator allocator = - ft::Allocator(at::cuda::getCurrentCUDAStream().device_index()); - ft::cublasMMWrapper cublas_wrapper = ft::cublasMMWrapper( - cublasHandle, cublasltHandle_, stream, cublas_algo_map_, cublas_wrapper_mutex_, &allocator); + + /// ft::Allocator allocator = + // ft::Allocator(at::cuda::getCurrentCUDAStream().device_index()); + allocator_ = new ft::Allocator(); + cublas_wrapper_ = new ft::cublasMMWrapper( + cublasHandle, cublasltHandle_, stream, cublas_algo_map_, cublas_wrapper_mutex_, allocator_); if (std::is_same::value) { - cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); } else if (std::is_same::value) { - cublas_wrapper.setFP32GemmConfig(); + cublas_wrapper_->setFP32GemmConfig(); } - const size_t request_batch_size = (size_t)input_ids.size(0); - const size_t seq_len = (size_t)input_ids.size(1); - ft::AttentionType attention_type = ft::getAttentionType(size_per_head_, ft::getSMVersion(), true, // remove_padding @@ -142,44 +128,63 @@ class FTLLaMA: public IFLLaMA { true, // is_fuse false, // with_relative_position_bias true); // causal_mask - - ft::LLaMA llama = ft::LLaMA(num_heads_, - size_per_head_, - inter_size_, - num_layers_, - vocab_size_, - rotary_embedding_dim_, - random_seed_, - 
max_seq_len_, - tensor_para_, - pipeline_para_, - stream, - &cublas_wrapper, - &allocator, - false, // is_free_buffer_after_forward - &prop_, // cuda_device_prop - attention_type // attention_type + // + llama_ = new ft::LLaMA(num_heads_, + size_per_head_, + inter_size_, + num_layers_, + vocab_size_, + rotary_embedding_dim_, + random_seed_, + max_seq_len_, + tensor_para_, + pipeline_para_, + stream, + cublas_wrapper_, + allocator_, + false, // is_free_buffer_after_forward + &prop_, // cuda_device_prop + attention_type // attention_type ); + } + + ~FTLLaMA() override + { + delete llama_; + delete cublas_wrapper_; + delete allocator_; + + ft::ftNcclParamDestroy(tensor_para_); + ft::ftNcclParamDestroy(pipeline_para_); + cublasLtDestroy(cublasltHandle_); + delete cublas_algo_map_; + delete cublas_wrapper_mutex_; + } + + virtual void + forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int start_pos) override + { + + const size_t batch_size = (size_t)input_ids.size(0); + const size_t seq_len = (size_t)input_ids.size(1); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_INT32, - std::vector{request_batch_size, seq_len}, - get_ptr(input_ids)}}, - {"input_lengths", ft::Tensor{ - ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{request_batch_size}, get_ptr(input_lengths)}}, + ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size, seq_len}, get_ptr(input_ids)}}, + {"input_lengths", + ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(input_lengths)}}, {"start_pos", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &start_pos}}}; std::unordered_map output_tensors = std::unordered_map{ {"output_logits", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, - std::vector{request_batch_size, seq_len, vocab_size_}, + std::vector{batch_size, seq_len, vocab_size_}, get_ptr(output_logits)}}}; + try { - llama.forward(&output_tensors, &input_tensors, &llama_weights_); + llama_->forward(&output_tensors, &input_tensors, &llama_weights_); } catch (std::runtime_error& error) { std::cout << error.what(); @@ -212,6 +217,10 @@ class FTLLaMA: public IFLLaMA { ft::NcclParam tensor_para_; ft::NcclParam pipeline_para_; + + ft::cublasMMWrapper* cublas_wrapper_; + ft::IAllocator* allocator_; + ft::LLaMA* llama_ = nullptr; }; class LLaMA: public th::jit::CustomClassHolder { diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h index d23f5ba8e..962a47764 100644 --- a/src/fastertransformer/utils/llama_utils.h +++ b/src/fastertransformer/utils/llama_utils.h @@ -130,6 +130,16 @@ static void print_tensor1(T* in, int dim1) free(out); } +template +static void print_tensor2(T* in, int dim1, int dim2) +{ + T* out = (T*)malloc(sizeof(T) * dim1 * dim2); + cudaMemcpy(out, in, sizeof(T) * dim1 * dim2, cudaMemcpyDeviceToHost); + _print_tensor2(out, dim1, dim2, dim2, 1); + std::cout << "\n"; + free(out); +} + template static void print_tensor3(T* in, int dim1, int dim2, int dim3, int stride1, int stride2, int size, int start) { From 857d956ccc142117aada942fde111d1a6024a401 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 25 Sep 2023 17:34:51 +0000 Subject: [PATCH 34/55] remove logging --- src/fastertransformer/models/llama/LLaMA.cc | 1 - src/fastertransformer/th_op/llama/LLaMA.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 29caa3722..f626e9a95 100644 --- 
a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -199,7 +199,6 @@ void LLaMA::forward(std::unordered_map* output_ten const std::unordered_map* input_tensors, const LLaMAWeight* llama_weights) { - // Logger::getLogger().setLevel(Logger::Level::DEBUG); // // input_tensors: // input_ids [batch_size, seq_len] diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 7595a2a88..bf41aa630 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -59,6 +59,8 @@ class FTLLaMA: public IFLLaMA { pipeline_para_size_(pipeline_para_size), weights_(weights) { + ft::Logger::getLogger().setLevel(ft::Logger::WARNING); + ft::check_cuda_error(cublasLtCreate(&cublasltHandle_)); cublas_algo_map_ = new ft::cublasAlgoMap(GEMM_CONFIG, ""); cublas_wrapper_mutex_ = new std::mutex(); From 3074afaaaabc559c4038854ab85f23ca7274bc12 Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 26 Sep 2023 03:42:00 +0000 Subject: [PATCH 35/55] remove README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 50f50cab2..72735e507 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Check out FasterTransformer [README.md](FasterTransformerReadME.md) mkdir -p FasterTransformer/build cd FasterTransformer/build git submodule init && git submodule update -cmake -DSM=xx -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON . +cmake -DSM=70 -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON .. make -j32 ``` From 80920075a5cecd68255dfd76522da34945f9d365 Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 26 Sep 2023 17:47:12 +0000 Subject: [PATCH 36/55] overlap --- .../models/llama/LLaMAContextDecoder.cc | 40 ++++++++++++++----- .../models/llama/LLaMAContextDecoder.h | 33 ++++++++------- src/fastertransformer/th_op/llama/LLaMA.h | 20 ++++++++-- 3 files changed, 66 insertions(+), 27 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 0f99e0887..c4ed10752 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -27,6 +27,9 @@ namespace fastertransformer { template void LLaMAContextDecoder::initialize() { + check_cuda_error(cudaStreamCreateWithFlags(&comm_stream_, cudaStreamNonBlocking)); + check_cuda_error(cudaEventCreate(&kern_event_)); + check_cuda_error(cudaEventCreate(&comm_event_)); self_attention_layer_ = new LLaMAContextAttentionLayer(head_num_, size_per_head_, head_num_, @@ -59,7 +62,7 @@ void LLaMAContextDecoder::allocateBuffer() } template -void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -67,6 +70,10 @@ void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); decoder_layer_output_ = reinterpret_cast( allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + if (layer_output_buffer_ == nullptr) { + layer_output_buffer_ = reinterpret_cast( + 
allocator_->reMalloc(layer_output_buffer_, sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); + } h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); @@ -166,6 +173,10 @@ LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decode template LLaMAContextDecoder::~LLaMAContextDecoder() { + check_cuda_error(cudaEventDestroy(kern_event_)); + check_cuda_error(cudaEventDestroy(comm_event_)); + check_cuda_error(cudaStreamDestroy(comm_stream_)); + delete self_attention_layer_; delete ffn_layer_; freeBuffer(); @@ -200,8 +211,8 @@ void LLaMAContextDecoder::forward(std::unordered_map* // output tensors: // decoder_output [batch_size, seq_len, hidden_dimension], - // key_cache [num_layer, batch, max_seq_len, local_head_num, size_per_head] - // value_cache [num_layer, batch, max_seq_len, local_head_num, size_per_head] + // key_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] + // value_cache [num_layer, batch, local_head_num, mxa_seq_len, size_per_head] // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * batch_size'. // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during @@ -210,11 +221,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* FT_CHECK(input_tensors->size() == 4); FT_CHECK(output_tensors->size() == 3); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; - const int start_pos = input_tensors->at("start_pos").max(); - const DataType data_type = getTensorType(); - allocateBuffer(batch_size, seq_len); + const int batch_size = input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; + const int start_pos = input_tensors->at("start_pos").max(); + const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; + const DataType data_type = getTensorType(); + allocateBuffer(batch_size, seq_len, max_seq_len); sync_check_cuda_error(); T* decoder_input = input_tensors->at("decoder_input").getPtr(); @@ -257,6 +269,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { + check_cuda_error(cudaEventSynchronize(kern_event_)); invokeRemovePadding( decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); sync_check_cuda_error(); @@ -354,8 +367,17 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; - ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); + check_cuda_error(cudaEventSynchronize(comm_event_)); + check_cuda_error(cudaMemcpyAsync( + layer_output_buffer_, layer_output, sizeof(T) * data_size, cudaMemcpyDeviceToDevice, stream_)); + check_cuda_error(cudaEventRecord(kern_event_, stream_)); + check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_)); + ftNcclSend(layer_output_buffer_, data_size, pipeline_para_.rank_ + 1, pipeline_para_, comm_stream_); sync_check_cuda_error(); + check_cuda_error(cudaEventRecord(comm_event_, comm_stream_)); + + //ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, comm_stream_); + //sync_check_cuda_error(); } if ((l == 
num_layer_ - 1) && is_unpadded_mha) { diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index cb6736f02..7a4866ddc 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -43,6 +43,10 @@ class LLaMAContextDecoder: public BaseLayer { size_t rotary_embedding_dim_; float layernorm_eps_; + cudaEvent_t kern_event_; + cudaEvent_t comm_event_; + cudaStream_t comm_stream_; + // calculated data size_t hidden_units_; @@ -56,7 +60,7 @@ class LLaMAContextDecoder: public BaseLayer { FfnLayer* ffn_layer_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len); void freeBuffer() override; bool isValidLayerParallelId(uint l); @@ -67,6 +71,7 @@ class LLaMAContextDecoder: public BaseLayer { void initialize(); protected: + T* layer_output_buffer_ = nullptr; T* decoder_normed_input_ = nullptr; T* self_attn_output_ = nullptr; T* decoder_layer_output_ = nullptr; @@ -75,19 +80,19 @@ class LLaMAContextDecoder: public BaseLayer { int* cu_seqlens_ = nullptr; public: - LLaMAContextDecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - AttentionType attention_type = AttentionType::FUSED_MHA); + LLaMAContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + float layernorm_eps, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type = AttentionType::FUSED_MHA); LLaMAContextDecoder(LLaMAContextDecoder const& decoder); diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index bf41aa630..597279b92 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -105,16 +105,17 @@ class FTLLaMA: public IFLLaMA { llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * num_layers_ + 3]); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); + ft::check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + ft::check_cuda_error(cudaEventCreate(&event_)); - auto stream = at::cuda::getCurrentCUDAStream().stream(); cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); - cublasSetStream(cublasHandle, stream); + cublasSetStream(cublasHandle, stream_); /// ft::Allocator allocator = // ft::Allocator(at::cuda::getCurrentCUDAStream().device_index()); allocator_ = new ft::Allocator(); cublas_wrapper_ = new ft::cublasMMWrapper( - cublasHandle, cublasltHandle_, stream, cublas_algo_map_, cublas_wrapper_mutex_, allocator_); + cublasHandle, cublasltHandle_, stream_, cublas_algo_map_, cublas_wrapper_mutex_, allocator_); if (std::is_same::value) { cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); @@ -141,7 +142,7 @@ class FTLLaMA: public IFLLaMA { max_seq_len_, tensor_para_, pipeline_para_, - stream, + stream_, cublas_wrapper_, allocator_, false, // is_free_buffer_after_forward @@ -152,6 +153,9 @@ class FTLLaMA: public IFLLaMA { ~FTLLaMA() 
override { + ft::check_cuda_error(cudaEventDestroy(event_)); + ft::check_cuda_error(cudaStreamDestroy(stream_)); + delete llama_; delete cublas_wrapper_; delete allocator_; @@ -186,7 +190,12 @@ class FTLLaMA: public IFLLaMA { get_ptr(output_logits)}}}; try { + ft::check_cuda_error(cudaEventSynchronize(event_)); llama_->forward(&output_tensors, &input_tensors, &llama_weights_); + ft::check_cuda_error(cudaEventRecord(event_, stream_)); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + ft::check_cuda_error(cudaStreamWaitEvent(stream, event_)); } catch (std::runtime_error& error) { std::cout << error.what(); @@ -210,6 +219,9 @@ class FTLLaMA: public IFLLaMA { int64_t tensor_para_size_; int64_t pipeline_para_size_; + cudaStream_t stream_; + cudaEvent_t event_; + std::vector weights_; cublasLtHandle_t cublasltHandle_; std::mutex* cublas_wrapper_mutex_; From 1187340ad4b78cd4b9d5d4300f96d7812acbbcd6 Mon Sep 17 00:00:00 2001 From: dypshong Date: Wed, 27 Sep 2023 15:36:49 +0000 Subject: [PATCH 37/55] overlapping versino --- src/fastertransformer/models/llama/LLaMAContextDecoder.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index c4ed10752..382999c37 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -269,7 +269,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { - check_cuda_error(cudaEventSynchronize(kern_event_)); invokeRemovePadding( decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); sync_check_cuda_error(); From 949c4e7737f412b9419fae8d87b4c4b79f3fca04 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 29 Sep 2023 00:34:43 +0000 Subject: [PATCH 38/55] start_pos for each sample --- .../kernels/llama_kernels.cu | 107 ++++++++++++--- src/fastertransformer/kernels/llama_kernels.h | 18 ++- .../kernels/unfused_attention_kernels.cu | 87 ++++++------- .../kernels/unfused_attention_kernels.h | 6 +- .../LLaMAContextAttentionLayer.cc | 32 +++-- src/fastertransformer/models/llama/LLaMA.cc | 97 +++++++------- src/fastertransformer/models/llama/LLaMA.h | 18 +-- .../models/llama/LLaMAContextDecoder.cc | 123 +++++++----------- .../models/llama/LLaMAContextDecoder.h | 17 +-- src/fastertransformer/th_op/llama/LLaMA.cc | 19 +-- src/fastertransformer/th_op/llama/LLaMA.h | 30 ++++- src/fastertransformer/utils/llama_utils.h | 14 +- 12 files changed, 320 insertions(+), 248 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index 3c753f866..5379eda1d 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -1,25 +1,64 @@ -#include "src/fastertransformer/utils/cuda_fp8_utils.h" #include "src/fastertransformer/kernels/llama_kernels.h" +#include "src/fastertransformer/utils/cuda_fp8_utils.h" + +#include +#include +#include namespace fastertransformer { +__global__ void LLaMAgetPaddingOffsetAndCuSeqLensKernel( + int* padding_offset, int* cu_seqlens, const int* sequence_length, const int batch_size, const int seq_len) +{ + // do cumulated sum + int total_seq_len = 0; + int cum_offset = 0; + int index = 0; + for (int i = 0; i < batch_size; i++) { + const int num_tokens = sequence_length[i]; + cu_seqlens[i] = total_seq_len; + for (int j = 0; j < num_tokens; j++) { + padding_offset[index] = 
cum_offset; + index++; + } + cum_offset += seq_len - num_tokens; + total_seq_len += num_tokens; + } + cu_seqlens[batch_size] = total_seq_len; +} + +void invokeLLaMAGetPaddingOffsetAndCuSeqLens(int* padding_offset, + int* cu_seqlens, + const int* input_lengths, + const int batch_size, + const int seq_len, + cudaStream_t stream) +{ + LLaMAgetPaddingOffsetAndCuSeqLensKernel<<<1, 1, 0, stream>>>( + padding_offset, cu_seqlens, input_lengths, batch_size, seq_len); +} + template -__global__ void LLaMAbuildDecoderAttentionMaskKernel( - T* attention_mask, const int* sequence_lengths, const int batch_size, const int seq_len, const int start_pos) +__global__ void LLaMAbuildDecoderAttentionMaskKernel(T* attention_mask, + const int* sequence_lengths, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int max_length) { - // sequence_lengths: - // [batch_size] // attention_mask: - // [batch_size, 1, seq_len, seq_len + start_pos] - const int max_length = seq_len + start_pos; + // [batch_size, 1, seq_len, max_length] + const int batch_idx = blockIdx.x; const int mask_size_per_seq = seq_len * max_length; - attention_mask += blockIdx.x * mask_size_per_seq; - const int seq_length = sequence_lengths[blockIdx.x]; + attention_mask += batch_idx * mask_size_per_seq; + const int context_length = context_lengths[batch_idx]; + const int length = sequence_lengths[batch_idx]; + const int offset = max_length - length; for (int i = threadIdx.x; i < mask_size_per_seq; i += blockDim.x) { int row_id = i / max_length; int col_id = i % max_length; - if (row_id < seq_length && col_id <= (row_id + start_pos)) { + if (row_id < length && col_id <= (row_id + context_length)) { attention_mask[i] = (T)(1.0f); } else { @@ -30,27 +69,59 @@ __global__ void LLaMAbuildDecoderAttentionMaskKernel( template void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, - const int* sequence_lengths, + const int* sequence_length, + const int* context_lengths, const int batch_size, const int seq_len, - const int start_pos, + const int max_length, cudaStream_t stream) { - LLaMAbuildDecoderAttentionMaskKernel - <<>>(attention_mask, sequence_lengths, batch_size, seq_len, start_pos); + LLaMAbuildDecoderAttentionMaskKernel<<>>( + attention_mask, sequence_length, context_lengths, batch_size, seq_len, max_length); } template void invokeLLaMABuildDecoderAttentionMask(float* attention_mask, - const int* sequence_lengths, + const int* sequence_length, + const int* context_lengths, const int batch_size, const int seq_len, - const int start_pos, + const int max_length, cudaStream_t stream); template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, - const int* sequence_lengths, + const int* sequence_length, + const int* context_lengths, const int batch_size, const int seq_len, - const int start_pos, + const int max_length, cudaStream_t stream); + +template +__global__ void LLaMACopyKernel(T* dst, T* src, const int count) +{ + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + if (idx * X_ELEMS >= count) { + return; + } + + auto v_dst = reinterpret_cast(dst); + auto v_src = reinterpret_cast(src); + v_dst[idx] = v_src[idx]; +} + +template +void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 
4 : 8; + assert(count % x == 0); + int grid_sz = (count / x + block_sz - 1) / block_sz; + LLaMACopyKernel<<>>(dst, src, count); +} + +template void invokeLLaMACopyKernel(float* dst, float* src, const int count, cudaStream_t stream); +template void invokeLLaMACopyKernel(half* dst, half* src, const int count, cudaStream_t stream); + } // namespace fastertransformer diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h index 320b5624f..a218b40d1 100644 --- a/src/fastertransformer/kernels/llama_kernels.h +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -1,15 +1,25 @@ #pragma once - #include "src/fastertransformer/utils/cuda_fp8_utils.h" #include "src/fastertransformer/utils/memory_utils.h" namespace fastertransformer { +void invokeLLaMAGetPaddingOffsetAndCuSeqLens(int* padding_offset, + int* cu_seqlens, + const int* input_lengths, + const int batch_size, + const int seq_len, + cudaStream_t stream); + template void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, - const int* sequence_lengths, + const int* sequence_length, + const int* context_lengths, const int batch_size, const int seq_len, - const int start_pos, + const int max_length, cudaStream_t stream); -} // namespace fastertransformer + +template +void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream); +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 1010ca3f3..134d63921 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1592,7 +1592,7 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int start_pos) + const int* start_pos) { constexpr int vec_size = Vec_t::size; using Vec_t = typename Vec_t::Type; @@ -1622,7 +1622,7 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, v = *reinterpret_cast(&QKV[src_v_idx]); } - mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, start_pos + seq_idx); + mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, start_pos[batch_idx] + seq_idx); const int dest_q_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + seq_idx * size_per_head + tidx * vec_size; @@ -1649,7 +1649,7 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int start_pos, + const int* start_pos, cudaStream_t stream) { dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); @@ -1678,7 +1678,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int start_pos, + const int* start_pos, cudaStream_t stream); template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, @@ -1692,7 +1692,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int start_pos, + const int* start_pos, cudaStream_t stream); #ifdef ENABLE_BF16 template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, @@ -1706,7 +1706,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int start_pos, + const 
int* start_pos, cudaStream_t stream); #endif @@ -1898,15 +1898,15 @@ INSTANTIATETRANSPOSE4DBATCHMAJOR(__nv_bfloat16); #undef INSTANTIATETRANSPOSE4DBATCHMAJOR template -__global__ void transpose_4d_save_to_cache(T* k_dst, - const T* k_src, - T* v_dst, - const T* v_src, - const int head_num, - const int size_per_head, - const int seq_len, - const int max_seq_len, - const int start_pos) +__global__ void transpose_4d_save_to_cache(T* k_dst, + const T* k_src, + T* v_dst, + const T* v_src, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len, + const int* start_pos) { // [batch_size, head_num, seq_len, size_per_head] const int batch_id = blockIdx.y; @@ -1915,16 +1915,14 @@ __global__ void transpose_4d_save_to_cache(T* k_dst, // 16 byte loads will handle "x" dimension auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * seq_len + head_id * size_per_head * seq_len); - auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * max_seq_len - + head_id * size_per_head * max_seq_len - + start_pos * size_per_head - ); + auto key_dst = + reinterpret_cast(k_dst + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len + start_pos[batch_id] * size_per_head); auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * seq_len + head_id * size_per_head * seq_len); - auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_seq_len - + head_id * size_per_head * max_seq_len - + start_pos * size_per_head - ); + auto val_dst = + reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len + start_pos[batch_id] * size_per_head); // idx is over output dimension L * size_per_head / x for values const int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -1950,7 +1948,7 @@ void invokeLLaMASaveToCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int start_pos, + const int* start_pos, cudaStream_t stream) { constexpr int block_sz = 128; @@ -1971,7 +1969,7 @@ void invokeLLaMASaveToCache(T* k_dst, const int max_seq_len, \ const int size_per_head, \ const int local_head_num, \ - const int start_pos, \ + const int* start_pos, \ cudaStream_t stream) INSTANTIATESAVETOCACHE(float); INSTANTIATESAVETOCACHE(half); @@ -1981,30 +1979,29 @@ INSTANTIATESAVETOCACHE(__nv_bfloat16); #undef INSTANTIATESAVETOCACHE template -__global__ void transpose_4d_load_from_cache(T* k_dst, - const T* k_src, - T* v_dst, - const T* v_src, - const int head_num, - const int size_per_head, - const int seq_len, - const int max_seq_len, - const int start_pos) +__global__ void transpose_4d_load_from_cache(T* k_dst, + const T* k_src, + T* v_dst, + const T* v_src, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len, + const int max_length) { - // [batch_size, head_num, start_pos+seq_len, size_per_head] + // [batch_size, head_num, max_length, size_per_head] const int batch_id = blockIdx.y; const int head_id = blockIdx.z; - const int real_seq_len = start_pos + seq_len; // 16 byte loads will handle "x" dimension auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * max_seq_len + head_id * size_per_head * max_seq_len); - auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * real_seq_len - + head_id * size_per_head * real_seq_len); + auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * 
max_length + + head_id * size_per_head * max_length); auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * max_seq_len + head_id * size_per_head * max_seq_len); - auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * real_seq_len - + head_id * size_per_head * real_seq_len); + auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_length + + head_id * size_per_head * max_length); // idx is over output dimension L * size_per_head / x for values const int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -2012,7 +2009,7 @@ __global__ void transpose_4d_load_from_cache(T* k_dst, constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; const int size_per_head_div_x = size_per_head / X_ELEMS; - if (idx >= size_per_head_div_x * real_seq_len) { + if (idx >= size_per_head_div_x * max_length) { return; } @@ -2030,15 +2027,15 @@ void invokeLLaMALoadFromCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int start_pos, + const int max_length, cudaStream_t stream) { constexpr int block_sz = 128; constexpr int x = (sizeof(T) == 4) ? 4 : 8; - dim3 grid(((start_pos + seq_len) * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); + dim3 grid((max_length * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); transpose_4d_load_from_cache<<>>( - k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, start_pos); + k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, max_length); } #define INSTANTIATELOADFROMCACHE(T) \ @@ -2051,7 +2048,7 @@ void invokeLLaMALoadFromCache(T* k_dst, const int max_seq_len, \ const int size_per_head, \ const int local_head_num, \ - const int start_pos, \ + const int max_length, \ cudaStream_t stream) INSTANTIATELOADFROMCACHE(float); INSTANTIATELOADFROMCACHE(half); diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index 2d4b01dde..c1d85816f 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -125,7 +125,7 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int start_pos, + const int* start_pos, cudaStream_t stream); template @@ -214,7 +214,7 @@ void invokeLLaMASaveToCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int start_pos, + const int* start_pos, cudaStream_t stream); template void invokeLLaMALoadFromCache(T* k_dst, @@ -226,7 +226,7 @@ void invokeLLaMALoadFromCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int start_pos, + const int max_length, cudaStream_t stream); template diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index a9989543a..bdf745562 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -30,12 +30,13 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten { // input_tensors: // input_query [token_num, hidden_dimension] - // attention_mask [batch_size, 1, seq_len, seq_len] + // attention_mask [batch_size, 1, seq_len, max_length] // attention_type [1] // 
layer_id [1], int on cpu + // start_pos, int, [batch_size] + // max_length, int, [batch_size] on cpu // padding_offset, int, [token_num] (optional) // cu_seqlens, int, [batch_size] (optional) - // each element contains ptr with buffer shape[2, head_num_, prompt_length, size_per_head] // output_tensors: // hidden_features [token_num, hidden_dimension] @@ -49,8 +50,9 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten const int max_seq_len = (int)(output_tensors->at("key_cache").shape[2]); const int layer_id = input_tensors->getVal("layer_id"); const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); - int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); - int start_pos = input_tensors->at("start_pos").max(); + const int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + const int* start_pos = input_tensors->at("start_pos").getPtr(); + const int max_length = input_tensors->at("max_length").getVal(); T* attention_out = output_tensors->at("hidden_features").getPtr(); T* attention_input = input_tensors->at("input_query").getPtr(); @@ -84,7 +86,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); + cudaMemsetAsync(q_buf_2_, 0, batch_size * max_seq_len * 3 * hidden_units_ * sizeof(T), stream_); sync_check_cuda_error(); } invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, @@ -102,6 +104,10 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten stream_); sync_check_cuda_error(); + // std::cout << layer_id << "===============\n"; + // print_tensor4(k_buf_2_, batch_size, head_num_, seq_len, size_per_head_); + // std::cout << layer_id << "===============\n"; + // key_cache [batch, local_head_num, max_seq_len, size_per_head] // value_cache [batch, local_head_num, max_seq_len, size_per_head] T* key_cache = output_tensors->getPtr("key_cache"); @@ -129,17 +135,21 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten max_seq_len, size_per_head_, head_num_, - start_pos, + max_length, stream_); + // std::cout << layer_id << "===============\n"; + // print_tensor4(k_buf_2_, batch_size, head_num_, max_length, size_per_head_); + // std::cout << layer_id << "===============\n"; + if (attention_type == AttentionType::FUSED_MHA) { dispatcher_fp16->setup_causal_masked_fmha(seq_len, batch_size); dispatcher_fp16->run_causal_masked_fmha(qkv_buf_, cu_seqlens, qkv_buf_3_, true, stream_); } else { const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = seq_len; // q length - const int attention_seq_len_2 = start_pos + seq_len; // kv length + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = max_length; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); // @@ -221,6 +231,9 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten sync_check_cuda_error(); POP_RANGE; } + //std::cout << layer_id << "===============\n"; + //print_tensor4(qk_buf_, batch_size, head_num_, attention_seq_len_1, attention_seq_len_2); + //std::cout << layer_id << "===============\n"; PUSH_RANGE("QK*V batch gemm"); cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, @@ -243,6 +256,9 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten batch_size * head_num_); sync_check_cuda_error(); + // std::cout << layer_id << "===============\n"; + // print_tensor4(qkv_buf_2_, 
batch_size, head_num_, attention_seq_len_1, size_per_head_); + // std::cout << layer_id << "===============\n"; // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) if (padding_offset == nullptr) { diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index f626e9a95..52f8c76f0 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -58,22 +58,19 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ input_attention_mask_ = (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_seq_len, false)); - normed_decoder_output_buf_ = - (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); - logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); value_cache_ = key_cache_ + self_cache_size; - tiled_input_ids_buf_ = - (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batch_size * seq_len, false)); - tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batch_size, false)); - context_decoder_input_buf_ = (T*)(allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); context_decoder_output_buf_ = (T*)(allocator_->reMalloc( context_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + normed_decoder_output_buf_ = + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); + is_allocate_buffer_ = true; } @@ -82,16 +79,10 @@ void LLaMA::freeBuffer() { if (is_allocate_buffer_) { allocator_->free((void**)(&input_attention_mask_)); - allocator_->free((void**)(&logits_buf_)); - allocator_->free((void**)(&key_cache_)); - - allocator_->free((void**)(&tiled_input_ids_buf_)); - allocator_->free((void**)(&tiled_input_lengths_buf_)); - allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); - + allocator_->free((void**)(&logits_buf_)); is_allocate_buffer_ = false; } } @@ -203,78 +194,86 @@ void LLaMA::forward(std::unordered_map* output_ten // input_tensors: // input_ids [batch_size, seq_len] // input_lengths [batch_size] - // start_pos [1] int on cpu + // start_pos [batch_size] + // num_tokens [1] int on cpu + // max_length [1] int on cpu // output_tensors: // output_logits [batch_size, seq_len, vocab_size] - FT_CHECK_WITH_INFO(input_tensors->size() >= 3, "input_tensors->size() >= 3"); + FT_CHECK_WITH_INFO(input_tensors->size() == 5, "input_tensors->size() == 5"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); - const size_t batch_size = input_tensors->at("input_ids").shape[0]; - int seq_len = input_tensors->at("input_ids").shape[1]; - - // max cache seq len should include max prefix prompt length as it has k/v states - const int start_pos = input_tensors->at("start_pos").max(); - const cudaDataType_t gemm_data_type = getCudaDataType(); + const DataType data_type = getTensorType(); + const size_t batch_size = input_tensors->at("input_ids").shape[0]; + const int seq_len = 
input_tensors->at("input_ids").shape[1]; + const int* input_ids = input_tensors->at("input_ids").getPtr(); + const int* start_pos = input_tensors->at("start_pos").getPtr(); + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int num_tokens = input_tensors->at("num_tokens").getVal(0); + const int max_length = input_tensors->at("max_length").getVal(0); allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); - const DataType data_type = getTensorType(); - const std::vector self_k_cache_shape = { - num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}; - const std::vector self_v_cache_shape = { - num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}; - - invokeTileGptInputs(tiled_input_ids_buf_, - tiled_input_lengths_buf_, - input_tensors->at("input_ids").getPtr(), - input_tensors->at("input_lengths").getPtr(), - batch_size, - 1, - seq_len, - stream_); - sync_check_cuda_error(); - invokeLLaMABuildDecoderAttentionMask( - input_attention_mask_, tiled_input_lengths_buf_, batch_size, seq_len, start_pos, stream_); + input_attention_mask_, input_lengths, start_pos, batch_size, seq_len, max_length, stream_); sync_check_cuda_error(); if (pipeline_para_.rank_ == 0) { invokeInputIdsEmbeddingLookup(context_decoder_input_buf_, llama_weights->pre_decoder_embedding_table, - tiled_input_ids_buf_, + input_ids, seq_len, batch_size, hidden_units_, stream_); sync_check_cuda_error(); } + else { + int data_size = batch_size * seq_len * hidden_units_; + ftNcclRecv(context_decoder_input_buf_, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); + sync_check_cuda_error(); + } std::unordered_map decoder_input_tensors{ {"decoder_input", Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_input_buf_}}, {"attention_mask", - Tensor{MEMORY_GPU, - data_type, - {batch_size, 1, (size_t)seq_len, (size_t)(start_pos + seq_len)}, - input_attention_mask_}}, - {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, tiled_input_lengths_buf_}}, - {"start_pos", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &start_pos}}}; + Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(max_length)}, input_attention_mask_}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, + {"start_pos", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, start_pos}}, + {"num_tokens", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &num_tokens}}, + {"max_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_length}}}; std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_output_buf_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, - {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}}; + {"key_cache", + Tensor{MEMORY_GPU, + data_type, + {num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, + key_cache_}}, + {"value_cache", + Tensor{MEMORY_GPU, + data_type, + {num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, + value_cache_}}}; llama_context_decoder_->forward( &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { + if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { + 
ftNcclSend(context_decoder_output_buf_, + batch_size * seq_len * hidden_units_, + pipeline_para_.rank_ + 1, + pipeline_para_, + stream_); + sync_check_cuda_error(); + } + else { invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 52f969a74..62af1a6d2 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -59,19 +59,15 @@ class LLaMA: public BaseLayer { void initialize(); protected: - T* input_attention_mask_ = nullptr; - T* decoder_output_buf_ = nullptr; - T* normed_decoder_output_buf_ = nullptr; - - T* logits_buf_ = nullptr; + T* input_attention_mask_ = nullptr; + T* key_cache_ = nullptr; + T* value_cache_ = nullptr; - T* key_cache_ = nullptr; - T* value_cache_ = nullptr; - - int* tiled_input_ids_buf_ = nullptr; - int* tiled_input_lengths_buf_ = nullptr; + T* decoder_output_buf_ = nullptr; + T* normed_decoder_output_buf_ = nullptr; + T* logits_buf_ = nullptr; - T* context_decoder_input_buf_ = nullptr; + T* context_decoder_input_buf_ = nullptr; T* context_decoder_output_buf_ = nullptr; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 382999c37..1393767ce 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -17,6 +17,7 @@ #include "src/fastertransformer/models/llama/LLaMAContextDecoder.h" #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" +#include "src/fastertransformer/kernels/llama_kernels.h" #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" @@ -27,9 +28,6 @@ namespace fastertransformer { template void LLaMAContextDecoder::initialize() { - check_cuda_error(cudaStreamCreateWithFlags(&comm_stream_, cudaStreamNonBlocking)); - check_cuda_error(cudaEventCreate(&kern_event_)); - check_cuda_error(cudaEventCreate(&comm_event_)); self_attention_layer_ = new LLaMAContextAttentionLayer(head_num_, size_per_head_, head_num_, @@ -64,20 +62,16 @@ void LLaMAContextDecoder::allocateBuffer() template void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len) { + padding_offset_ = + reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); + cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); self_attn_output_ = reinterpret_cast( allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); decoder_layer_output_ = reinterpret_cast( allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); - if (layer_output_buffer_ == nullptr) { - layer_output_buffer_ = reinterpret_cast( - allocator_->reMalloc(layer_output_buffer_, sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); - } - h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); - padding_offset_ = - 
reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); - cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); is_allocate_buffer_ = true; } @@ -88,9 +82,8 @@ void LLaMAContextDecoder::freeBuffer() allocator_->free((void**)(&decoder_normed_input_)); allocator_->free((void**)(&self_attn_output_)); allocator_->free((void**)(&decoder_layer_output_)); - allocator_->free((void**)(&h_pinned_token_num_ptr_), true); - allocator_->free((void**)(&padding_offset_)); allocator_->free((void**)(&cu_seqlens_)); + allocator_->free((void**)(&padding_offset_)); is_allocate_buffer_ = false; } } @@ -173,10 +166,6 @@ LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decode template LLaMAContextDecoder::~LLaMAContextDecoder() { - check_cuda_error(cudaEventDestroy(kern_event_)); - check_cuda_error(cudaEventDestroy(comm_event_)); - check_cuda_error(cudaStreamDestroy(comm_stream_)); - delete self_attention_layer_; delete ffn_layer_; freeBuffer(); @@ -207,7 +196,9 @@ void LLaMAContextDecoder::forward(std::unordered_map* // decoder_input [batch_size, seq_len, hidden_dimension], // attention_mask [batch_size, 1, seq_len, seq_len] // input_lengths [batch_size] - // start_pos [1] + // start_pos [batch_size] + // num_tokens [1] int on cpu + // max_length [1] int on cpu // output tensors: // decoder_output [batch_size, seq_len, hidden_dimension], @@ -218,14 +209,17 @@ void LLaMAContextDecoder::forward(std::unordered_map* // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. - FT_CHECK(input_tensors->size() == 4); + FT_CHECK(input_tensors->size() == 6); FT_CHECK(output_tensors->size() == 3); - - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; - const int start_pos = input_tensors->at("start_pos").max(); - const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; - const DataType data_type = getTensorType(); + const DataType data_type = getTensorType(); + const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); + const int batch_size = input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* start_pos = input_tensors->at("start_pos").getPtr(); + const int max_length = input_tensors->at("max_length").getVal(0); + + const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; allocateBuffer(batch_size, seq_len, max_seq_len); sync_check_cuda_error(); @@ -246,20 +240,15 @@ void LLaMAContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } - AttentionType attention_type = attention_type_; - const bool is_unpadded_mha = isUnPaddedMHA(attention_type); - size_t h_token_num = batch_size * seq_len; if (is_unpadded_mha) { - const int* input_lengths = input_tensors->at("input_lengths").getPtr(); - invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, - &h_token_num, - padding_offset_, - cu_seqlens_, - input_lengths, - batch_size, - seq_len, - stream_); + invokeLLaMAGetPaddingOffsetAndCuSeqLens( + padding_offset_, cu_seqlens_, input_lengths, batch_size, seq_len, stream_); + sync_check_cuda_error(); + + h_token_num = input_tensors->at("num_tokens").getVal(); + + invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); 
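The unpadded path above computes one padding offset per kept token plus cumulative sequence lengths, and invokeRemovePadding then packs [batch, seq_len, hidden] down to [num_tokens, hidden]. A CPU sketch of that bookkeeping with example lengths, mirroring LLaMAgetPaddingOffsetAndCuSeqLensKernel from this patch:

```
#include <cstdio>
#include <vector>

int main()
{
    const int seq_len = 8;                      // padded length (assumed)
    const std::vector<int> lengths = {5, 3, 8}; // per-sample valid lengths (example values)

    std::vector<int> padding_offset, cu_seqlens = {0};
    int cum_offset = 0;
    for (int len : lengths) {
        for (int j = 0; j < len; ++j)
            padding_offset.push_back(cum_offset);   // padding slots accumulated so far
        cum_offset += seq_len - len;
        cu_seqlens.push_back(cu_seqlens.back() + len);
    }

    for (int off : padding_offset) std::printf("%d ", off);  // 0 x5, 3 x3, 8 x8
    std::printf("\n");
    for (int cs : cu_seqlens)      std::printf("%d ", cs);   // 0 5 8 16
    std::printf("\n");
    return 0;
}
```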
sync_check_cuda_error(); } @@ -268,30 +257,18 @@ void LLaMAContextDecoder::forward(std::unordered_map* continue; } - if (l == 0 && is_unpadded_mha) { - invokeRemovePadding( - decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); - sync_check_cuda_error(); - } - const bool is_final = false; T* layer_input = decoder_layer_output_; T* layer_output = decoder_layer_output_; if (!is_unpadded_mha) { - if (l == 0) { + if (isFirstLayerParallelId(l)) { layer_input = decoder_input; } - if (l == num_layer_ - 1) { + if (isLastLayerParallelId(l)) { layer_output = decoder_output; } } - if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_; - ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - sync_check_cuda_error(); - } - invokeGeneralLLaMALayerNorm(decoder_normed_input_, layer_input, llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, @@ -306,11 +283,14 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"attention_mask", Tensor{MEMORY_GPU, data_type, - {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len)}, + {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(max_length)}, attention_mask}}, - {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, + {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, - {"start_pos", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &start_pos}}}; + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, input_lengths}}, + {"start_pos", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, start_pos}}, + {"max_length", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &max_length}}, + }; if (is_unpadded_mha) { self_attention_input_tensors.insert("padding_offset", @@ -333,6 +313,11 @@ void LLaMAContextDecoder::forward(std::unordered_map* &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); + //std::cout << l << "===============\n"; + //print_tensor2(self_attn_output_, h_token_num, hidden_units_); + //std::cout << l << "===============\n"; + + invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, decoder_normed_input_, @@ -362,32 +347,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* stream_); sync_check_cuda_error(); + } - if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 - && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_; - check_cuda_error(cudaEventSynchronize(comm_event_)); - check_cuda_error(cudaMemcpyAsync( - layer_output_buffer_, layer_output, sizeof(T) * data_size, cudaMemcpyDeviceToDevice, stream_)); - check_cuda_error(cudaEventRecord(kern_event_, stream_)); - check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_)); - ftNcclSend(layer_output_buffer_, data_size, pipeline_para_.rank_ + 1, pipeline_para_, comm_stream_); - sync_check_cuda_error(); - check_cuda_error(cudaEventRecord(comm_event_, comm_stream_)); - - //ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, comm_stream_); - //sync_check_cuda_error(); - } - - if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output, - decoder_layer_output_, - padding_offset_, - h_token_num, - head_num_ * size_per_head_, - stream_); - sync_check_cuda_error(); - } + if (is_unpadded_mha) { + invokeRebuildPadding( + decoder_output, 
decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); + sync_check_cuda_error(); } if (is_free_buffer_after_forward_ == true) { diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index 7a4866ddc..d76ff0687 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -43,10 +43,6 @@ class LLaMAContextDecoder: public BaseLayer { size_t rotary_embedding_dim_; float layernorm_eps_; - cudaEvent_t kern_event_; - cudaEvent_t comm_event_; - cudaStream_t comm_stream_; - // calculated data size_t hidden_units_; @@ -71,13 +67,12 @@ class LLaMAContextDecoder: public BaseLayer { void initialize(); protected: - T* layer_output_buffer_ = nullptr; - T* decoder_normed_input_ = nullptr; - T* self_attn_output_ = nullptr; - T* decoder_layer_output_ = nullptr; - size_t* h_pinned_token_num_ptr_ = nullptr; - int* padding_offset_ = nullptr; - int* cu_seqlens_ = nullptr; + int* padding_offset_ = nullptr; + int* cu_seqlens_ = nullptr; + T* decoder_normed_input_ = nullptr; + T* self_attn_output_ = nullptr; + T* decoder_layer_output_ = nullptr; + size_t* h_pinned_token_num_ptr_ = nullptr; public: LLaMAContextDecoder(size_t head_num, diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index 45c1e1575..e2b819c4b 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -30,8 +30,7 @@ LLaMA::LLaMA(const int64_t num_heads, const int64_t tensor_para_size, const int64_t pipeline_para_size, const vector weights): - vocab_size_(vocab_size), - st_(weights[0].scalar_type()) + vocab_size_(vocab_size), st_(weights[0].scalar_type()) { for (auto t : weights) { CHECK_INPUT(t, st_); @@ -74,8 +73,12 @@ LLaMA::~LLaMA() delete ftllama; } -th::Tensor -LLaMA::forward(th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t start_pos) +th::Tensor LLaMA::forward(th::Tensor& output_logits, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& start_pos, + const int64_t num_tokens, + const int64_t max_length) { CHECK_TH_CUDA(input_ids); CHECK_CONTIGUOUS(input_ids); @@ -84,11 +87,9 @@ LLaMA::forward(th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t s CHECK_CONTIGUOUS(input_lengths); TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); - const int batch_size = input_ids.size(0); - const int seq_len = input_ids.size(1); - th::Tensor output_logits = torch::empty({batch_size, seq_len, (long)vocab_size_}, - torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false)); - ftllama->forward(output_logits, input_ids, input_lengths, (int)start_pos); + const int batch_size = input_ids.size(0); + const int seq_len = input_ids.size(1); + ftllama->forward(output_logits, input_ids, input_lengths, start_pos, num_tokens, max_length); return output_logits; } diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 597279b92..237728c1d 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -29,8 +29,12 @@ using std::vector; class IFLLaMA { public: virtual ~IFLLaMA() {} - virtual void - forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int start_pos) = 0; + virtual void forward(th::Tensor& output_logits, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& 
start_pos, + const int num_tokens, + const int max_length) = 0; }; template @@ -167,8 +171,12 @@ class FTLLaMA: public IFLLaMA { delete cublas_wrapper_mutex_; } - virtual void - forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int start_pos) override + virtual void forward(th::Tensor& output_logits, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& start_pos, + const int num_tokens, + const int max_length) override { const size_t batch_size = (size_t)input_ids.size(0); @@ -180,7 +188,10 @@ class FTLLaMA: public IFLLaMA { ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size, seq_len}, get_ptr(input_ids)}}, {"input_lengths", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(input_lengths)}}, - {"start_pos", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &start_pos}}}; + {"start_pos", + ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(start_pos)}}, + {"num_tokens", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &num_tokens}}, + {"max_length", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &max_length}}}; std::unordered_map output_tensors = std::unordered_map{ {"output_logits", @@ -220,7 +231,7 @@ class FTLLaMA: public IFLLaMA { int64_t pipeline_para_size_; cudaStream_t stream_; - cudaEvent_t event_; + cudaEvent_t event_; std::vector weights_; cublasLtHandle_t cublasltHandle_; @@ -253,7 +264,12 @@ class LLaMA: public th::jit::CustomClassHolder { ~LLaMA(); - th::Tensor forward(th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t start_pos); + th::Tensor forward(th::Tensor& output_logits, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& start_pos, + const int64_t num_tokens, + const int64_t max_length); private: const at::ScalarType st_; diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h index 962a47764..deed71f2c 100644 --- a/src/fastertransformer/utils/llama_utils.h +++ b/src/fastertransformer/utils/llama_utils.h @@ -131,15 +131,21 @@ static void print_tensor1(T* in, int dim1) } template -static void print_tensor2(T* in, int dim1, int dim2) +static void print_tensor2(T* in, int dim1, int dim2, int stride1, int size, int start) { - T* out = (T*)malloc(sizeof(T) * dim1 * dim2); - cudaMemcpy(out, in, sizeof(T) * dim1 * dim2, cudaMemcpyDeviceToHost); - _print_tensor2(out, dim1, dim2, dim2, 1); + T* out = (T*)malloc(sizeof(T) * size); + cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); + _print_tensor2(&out[start], dim1, dim2, stride1, 1); std::cout << "\n"; free(out); } +template +static void print_tensor2(T* in, int dim1, int dim2) +{ + print_tensor2(in, dim1, dim2, dim2, dim1 * dim2, 0); +} + template static void print_tensor3(T* in, int dim1, int dim2, int dim3, int stride1, int stride2, int size, int start) { From 083f3bb74d6535edf794b97be1d9475d09e3f748 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 29 Sep 2023 00:50:46 +0000 Subject: [PATCH 39/55] get back start_pos --- .../kernels/llama_kernels.cu | 1 - .../kernels/unfused_attention_kernels.cu | 14 ++++----- .../LLaMAContextAttentionLayer.cc | 29 ++++++++++--------- src/fastertransformer/models/llama/LLaMA.cc | 22 +++++++------- .../models/llama/LLaMAContextDecoder.cc | 17 ++++++----- src/fastertransformer/th_op/llama/LLaMA.cc | 4 +-- src/fastertransformer/th_op/llama/LLaMA.h | 10 +++---- 7 files changed, 49 insertions(+), 48 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu 
b/src/fastertransformer/kernels/llama_kernels.cu index 5379eda1d..360adeab3 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -53,7 +53,6 @@ __global__ void LLaMAbuildDecoderAttentionMaskKernel(T* attention_mask, attention_mask += batch_idx * mask_size_per_seq; const int context_length = context_lengths[batch_idx]; const int length = sequence_lengths[batch_idx]; - const int offset = max_length - length; for (int i = threadIdx.x; i < mask_size_per_seq; i += blockDim.x) { int row_id = i / max_length; diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 134d63921..3259b66df 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1592,7 +1592,7 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int* start_pos) + const int* context_lengths) { constexpr int vec_size = Vec_t::size; using Vec_t = typename Vec_t::Type; @@ -1622,7 +1622,7 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, v = *reinterpret_cast(&QKV[src_v_idx]); } - mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, start_pos[batch_idx] + seq_idx); + mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, context_lengths[batch_idx] + seq_idx); const int dest_q_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + seq_idx * size_per_head + tidx * vec_size; @@ -1649,7 +1649,7 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int* start_pos, + const int* context_lengths, cudaStream_t stream) { dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); @@ -1664,7 +1664,7 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, head_num, size_per_head, rotary_embedding_dim, - start_pos); + context_lengths); } template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, @@ -1678,7 +1678,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int* start_pos, + const int* context_lengths, cudaStream_t stream); template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, @@ -1692,7 +1692,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int* start_pos, + const int* context_lengths, cudaStream_t stream); #ifdef ENABLE_BF16 template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, @@ -1706,7 +1706,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int* start_pos, + const int* context_lengths, cudaStream_t stream); #endif diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index bdf745562..f9d6e2838 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -33,7 +33,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // attention_mask [batch_size, 1, seq_len, max_length] // 
attention_type [1] // layer_id [1], int on cpu - // start_pos, int, [batch_size] + // context_lengths, int, [batch_size] // max_length, int, [batch_size] on cpu // padding_offset, int, [token_num] (optional) // cu_seqlens, int, [batch_size] (optional) @@ -42,17 +42,18 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // hidden_features [token_num, hidden_dimension] // key_cache [batch, local_head_num, max_seq_len, size_per_head] // value_cache [batch, local_head_num, max_seq_len, size_per_head] + FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); FT_CHECK(output_tensors->at("key_cache").shape.size() == 4); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); - const int batch_size = input_tensors->at("attention_mask").shape[0]; - const int seq_len = input_tensors->at("attention_mask").shape[2]; - const int max_seq_len = (int)(output_tensors->at("key_cache").shape[2]); - const int layer_id = input_tensors->getVal("layer_id"); - const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); - const int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); - const int* start_pos = input_tensors->at("start_pos").getPtr(); - const int max_length = input_tensors->at("max_length").getVal(); + const int batch_size = input_tensors->at("attention_mask").shape[0]; + const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int max_seq_len = (int)(output_tensors->at("key_cache").shape[2]); + const int layer_id = input_tensors->getVal("layer_id"); + const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); + const int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int max_length = input_tensors->at("max_length").getVal(); T* attention_out = output_tensors->at("hidden_features").getPtr(); T* attention_input = input_tensors->at("input_query").getPtr(); @@ -100,7 +101,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten head_num_, size_per_head_, rotary_embedding_dim_, - start_pos, + context_lengths, stream_); sync_check_cuda_error(); @@ -121,7 +122,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten max_seq_len, size_per_head_, head_num_, - start_pos, + context_lengths, stream_); sync_check_cuda_error(); POP_RANGE; @@ -231,9 +232,9 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten sync_check_cuda_error(); POP_RANGE; } - //std::cout << layer_id << "===============\n"; - //print_tensor4(qk_buf_, batch_size, head_num_, attention_seq_len_1, attention_seq_len_2); - //std::cout << layer_id << "===============\n"; + // std::cout << layer_id << "===============\n"; + // print_tensor4(qk_buf_, batch_size, head_num_, attention_seq_len_1, attention_seq_len_2); + // std::cout << layer_id << "===============\n"; PUSH_RANGE("QK*V batch gemm"); cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 52f8c76f0..2a04732a5 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -194,7 +194,7 @@ void LLaMA::forward(std::unordered_map* output_ten // input_tensors: // input_ids [batch_size, seq_len] // input_lengths [batch_size] - // start_pos [batch_size] + // context_lengths [batch_size] // num_tokens [1] int on cpu // max_length [1] int on cpu @@ -205,20 +205,20 @@ void LLaMA::forward(std::unordered_map* output_ten 
FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); - const DataType data_type = getTensorType(); - const size_t batch_size = input_tensors->at("input_ids").shape[0]; - const int seq_len = input_tensors->at("input_ids").shape[1]; - const int* input_ids = input_tensors->at("input_ids").getPtr(); - const int* start_pos = input_tensors->at("start_pos").getPtr(); - const int* input_lengths = input_tensors->at("input_lengths").getPtr(); - const int num_tokens = input_tensors->at("num_tokens").getVal(0); - const int max_length = input_tensors->at("max_length").getVal(0); + const DataType data_type = getTensorType(); + const size_t batch_size = input_tensors->at("input_ids").shape[0]; + const int seq_len = input_tensors->at("input_ids").shape[1]; + const int* input_ids = input_tensors->at("input_ids").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int num_tokens = input_tensors->at("num_tokens").getVal(0); + const int max_length = input_tensors->at("max_length").getVal(0); allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); invokeLLaMABuildDecoderAttentionMask( - input_attention_mask_, input_lengths, start_pos, batch_size, seq_len, max_length, stream_); + input_attention_mask_, input_lengths, context_lengths, batch_size, seq_len, max_length, stream_); sync_check_cuda_error(); if (pipeline_para_.rank_ == 0) { @@ -243,7 +243,7 @@ void LLaMA::forward(std::unordered_map* output_ten {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(max_length)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, - {"start_pos", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, start_pos}}, + {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, context_lengths}}, {"num_tokens", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &num_tokens}}, {"max_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_length}}}; diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 1393767ce..7946812b9 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -179,7 +179,9 @@ void LLaMAContextDecoder::forward(std::vector* std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, {"attention_mask", input_tensors->at(1)}, {"input_lengths", input_tensors->at(2)}, - {"start_pos", input_tensors->at(3)}}; + {"context_lengths", input_tensors->at(3)}, + {"num_tokens", input_tensors->at(4)}, + {"max_length", input_tensors->at(5)}}; std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, {"key_cache", output_tensors->at(1)}, {"value_cache", output_tensors->at(2)}}; @@ -196,7 +198,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* // decoder_input [batch_size, seq_len, hidden_dimension], // attention_mask [batch_size, 1, seq_len, seq_len] // input_lengths [batch_size] - // start_pos [batch_size] + // context_lengths [batch_size] // num_tokens [1] int on cpu // max_length [1] int on cpu @@ -216,7 +218,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* const int batch_size = input_tensors->at("decoder_input").shape[0]; const int seq_len = input_tensors->at("decoder_input").shape[1]; const int* input_lengths = 
input_tensors->at("input_lengths").getPtr(); - const int* start_pos = input_tensors->at("start_pos").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); const int max_length = input_tensors->at("max_length").getVal(0); const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; @@ -288,7 +290,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, input_lengths}}, - {"start_pos", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, start_pos}}, + {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, context_lengths}}, {"max_length", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &max_length}}, }; @@ -313,10 +315,9 @@ void LLaMAContextDecoder::forward(std::unordered_map* &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); - //std::cout << l << "===============\n"; - //print_tensor2(self_attn_output_, h_token_num, hidden_units_); - //std::cout << l << "===============\n"; - + // std::cout << l << "===============\n"; + // print_tensor2(self_attn_output_, h_token_num, hidden_units_); + // std::cout << l << "===============\n"; invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index e2b819c4b..8aada6aff 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -76,7 +76,7 @@ LLaMA::~LLaMA() th::Tensor LLaMA::forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, - th::Tensor& start_pos, + th::Tensor& context_lengths, const int64_t num_tokens, const int64_t max_length) { @@ -89,7 +89,7 @@ th::Tensor LLaMA::forward(th::Tensor& output_logits, const int batch_size = input_ids.size(0); const int seq_len = input_ids.size(1); - ftllama->forward(output_logits, input_ids, input_lengths, start_pos, num_tokens, max_length); + ftllama->forward(output_logits, input_ids, input_lengths, context_lengths, num_tokens, max_length); return output_logits; } diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 237728c1d..365a07dd1 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -32,7 +32,7 @@ class IFLLaMA { virtual void forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, - th::Tensor& start_pos, + th::Tensor& context_lengths, const int num_tokens, const int max_length) = 0; }; @@ -174,7 +174,7 @@ class FTLLaMA: public IFLLaMA { virtual void forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, - th::Tensor& start_pos, + th::Tensor& context_lengths, const int num_tokens, const int max_length) override { @@ -188,8 +188,8 @@ class FTLLaMA: public IFLLaMA { ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size, seq_len}, get_ptr(input_ids)}}, {"input_lengths", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(input_lengths)}}, - {"start_pos", - ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(start_pos)}}, + {"context_lengths", + ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(context_lengths)}}, {"num_tokens", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, 
std::vector{1}, &num_tokens}}, {"max_length", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &max_length}}}; @@ -267,7 +267,7 @@ class LLaMA: public th::jit::CustomClassHolder { th::Tensor forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, - th::Tensor& start_pos, + th::Tensor& context_lengths, const int64_t num_tokens, const int64_t max_length); From f08ada9154c1203ff51590424a1d7ace9d08924d Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 29 Sep 2023 13:19:53 +0000 Subject: [PATCH 40/55] debug --- .../kernels/unfused_attention_kernels.cu | 10 +++++----- .../LLaMAContextAttentionLayer.cc | 17 ++--------------- src/fastertransformer/models/llama/LLaMA.cc | 1 + .../models/llama/LLaMAContextDecoder.cc | 4 ---- 4 files changed, 8 insertions(+), 24 deletions(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 3259b66df..c0d673c2a 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1906,7 +1906,7 @@ __global__ void transpose_4d_save_to_cache(T* k_dst, const int size_per_head, const int seq_len, const int max_seq_len, - const int* start_pos) + const int* context_lengths) { // [batch_size, head_num, seq_len, size_per_head] const int batch_id = blockIdx.y; @@ -1917,12 +1917,12 @@ __global__ void transpose_4d_save_to_cache(T* k_dst, + head_id * size_per_head * seq_len); auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * max_seq_len - + head_id * size_per_head * max_seq_len + start_pos[batch_id] * size_per_head); + + head_id * size_per_head * max_seq_len + context_lengths[batch_id] * size_per_head); auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * seq_len + head_id * size_per_head * seq_len); auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_seq_len - + head_id * size_per_head * max_seq_len + start_pos[batch_id] * size_per_head); + + head_id * size_per_head * max_seq_len + context_lengths[batch_id] * size_per_head); // idx is over output dimension L * size_per_head / x for values const int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -1948,7 +1948,7 @@ void invokeLLaMASaveToCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int* start_pos, + const int* context_lengths, cudaStream_t stream) { constexpr int block_sz = 128; @@ -1956,7 +1956,7 @@ void invokeLLaMASaveToCache(T* k_dst, dim3 grid((seq_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); transpose_4d_save_to_cache<<>>( - k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, start_pos); + k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, context_lengths); } #define INSTANTIATESAVETOCACHE(T) \ diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index f9d6e2838..5557454a5 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -42,7 +42,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // hidden_features [token_num, hidden_dimension] // key_cache [batch, local_head_num, max_seq_len, size_per_head] // value_cache [batch, local_head_num, max_seq_len, 
size_per_head] - + FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); FT_CHECK(output_tensors->at("key_cache").shape.size() == 4); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); @@ -105,10 +105,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten stream_); sync_check_cuda_error(); - // std::cout << layer_id << "===============\n"; - // print_tensor4(k_buf_2_, batch_size, head_num_, seq_len, size_per_head_); - // std::cout << layer_id << "===============\n"; - // key_cache [batch, local_head_num, max_seq_len, size_per_head] // value_cache [batch, local_head_num, max_seq_len, size_per_head] T* key_cache = output_tensors->getPtr("key_cache"); @@ -139,10 +135,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten max_length, stream_); - // std::cout << layer_id << "===============\n"; - // print_tensor4(k_buf_2_, batch_size, head_num_, max_length, size_per_head_); - // std::cout << layer_id << "===============\n"; - if (attention_type == AttentionType::FUSED_MHA) { dispatcher_fp16->setup_causal_masked_fmha(seq_len, batch_size); dispatcher_fp16->run_causal_masked_fmha(qkv_buf_, cu_seqlens, qkv_buf_3_, true, stream_); @@ -183,6 +175,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten sync_check_cuda_error(); POP_RANGE; + PUSH_RANGE("softmax"); MaskedSoftmaxParam param; param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) @@ -232,9 +225,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten sync_check_cuda_error(); POP_RANGE; } - // std::cout << layer_id << "===============\n"; - // print_tensor4(qk_buf_, batch_size, head_num_, attention_seq_len_1, attention_seq_len_2); - // std::cout << layer_id << "===============\n"; PUSH_RANGE("QK*V batch gemm"); cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, @@ -257,9 +247,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten batch_size * head_num_); sync_check_cuda_error(); - // std::cout << layer_id << "===============\n"; - // print_tensor4(qkv_buf_2_, batch_size, head_num_, attention_seq_len_1, size_per_head_); - // std::cout << layer_id << "===============\n"; // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) if (padding_offset == nullptr) { diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 2a04732a5..95282ba1c 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -265,6 +265,7 @@ void LLaMA::forward(std::unordered_map* output_ten &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); + if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { ftNcclSend(context_decoder_output_buf_, batch_size * seq_len * hidden_units_, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 7946812b9..9eaa81758 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -315,10 +315,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); - // std::cout << l << "===============\n"; - // print_tensor2(self_attn_output_, h_token_num, hidden_units_); - // std::cout << l << "===============\n"; - invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, decoder_normed_input_, From 
e5d92df821f27e090e1bb8019e9bdd3b89f499ff Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 29 Sep 2023 19:58:24 +0000 Subject: [PATCH 41/55] chkpt --- .../kernels/llama_kernels.cu | 65 ++++++++-- src/fastertransformer/kernels/llama_kernels.h | 7 ++ .../kernels/unfused_attention_kernels.cu | 22 ++-- .../kernels/unfused_attention_kernels.h | 2 +- .../LLaMAContextAttentionLayer.cc | 31 +++-- src/fastertransformer/models/llama/LLaMA.cc | 118 ++++++++++++------ src/fastertransformer/models/llama/LLaMA.h | 17 ++- .../models/llama/LLaMAContextDecoder.cc | 60 ++++----- src/fastertransformer/th_op/llama/LLaMA.cc | 8 +- src/fastertransformer/th_op/llama/LLaMA.h | 39 +++--- 10 files changed, 240 insertions(+), 129 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index 360adeab3..95700cb1f 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -1,12 +1,59 @@ #include "src/fastertransformer/kernels/llama_kernels.h" #include "src/fastertransformer/utils/cuda_fp8_utils.h" +#include + #include #include #include +using namespace std; namespace fastertransformer { +template +__global__ void LLaMAstart_id_embedding_lookups_kernel( + T* out, const T* embedding_table, const int* input_ids, const int num_tokens, const int64_t hidden_units) +{ + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_tokens * hidden_units; + index += blockDim.x * gridDim.x) { + + // embedding lookup from word ids [batch, length] (part of [batch, length]) and [vocab, hidden] to generate + // embedding [batch, length, hidden] + const int word_index = index / hidden_units; + const int col_index = index % hidden_units; + const int input_id = input_ids[word_index]; + + out[index] = embedding_table[input_id * hidden_units + col_index]; + } +} + +template +void invokeLLaMAInputIdsEmbeddingLookup(T* out, + const T* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream) +{ + dim3 grid(min(num_tokens, 65536)); + dim3 block(min(hidden_units, 512)); + LLaMAstart_id_embedding_lookups_kernel + <<>>(out, embedding_table, input_ids, num_tokens, hidden_units); +} + +template void invokeLLaMAInputIdsEmbeddingLookup(float* out, + const float* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); +template void invokeLLaMAInputIdsEmbeddingLookup(half* out, + const half* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); + __global__ void LLaMAgetPaddingOffsetAndCuSeqLensKernel( int* padding_offset, int* cu_seqlens, const int* sequence_length, const int batch_size, const int seq_len) { @@ -44,19 +91,19 @@ __global__ void LLaMAbuildDecoderAttentionMaskKernel(T* attention_mask, const int* context_lengths, const int batch_size, const int seq_len, - const int max_length) + const int attn_len) { // attention_mask: - // [batch_size, 1, seq_len, max_length] + // [batch_size, 1, seq_len, attn_len] const int batch_idx = blockIdx.x; - const int mask_size_per_seq = seq_len * max_length; + const int mask_size_per_seq = seq_len * attn_len; attention_mask += batch_idx * mask_size_per_seq; const int context_length = context_lengths[batch_idx]; const int length = sequence_lengths[batch_idx]; for (int i = threadIdx.x; i < mask_size_per_seq; i += blockDim.x) { - int row_id = i / max_length; - int col_id = i % max_length; + int row_id = i / 
attn_len; + int col_id = i % attn_len; if (row_id < length && col_id <= (row_id + context_length)) { attention_mask[i] = (T)(1.0f); } @@ -72,11 +119,11 @@ void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, const int* context_lengths, const int batch_size, const int seq_len, - const int max_length, + const int attn_len, cudaStream_t stream) { LLaMAbuildDecoderAttentionMaskKernel<<>>( - attention_mask, sequence_length, context_lengths, batch_size, seq_len, max_length); + attention_mask, sequence_length, context_lengths, batch_size, seq_len, attn_len); } template void invokeLLaMABuildDecoderAttentionMask(float* attention_mask, @@ -84,7 +131,7 @@ template void invokeLLaMABuildDecoderAttentionMask(float* attention_mask, const int* context_lengths, const int batch_size, const int seq_len, - const int max_length, + const int attn_len, cudaStream_t stream); template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, @@ -92,7 +139,7 @@ template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, const int* context_lengths, const int batch_size, const int seq_len, - const int max_length, + const int attn_len, cudaStream_t stream); template diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h index a218b40d1..2d1c9592e 100644 --- a/src/fastertransformer/kernels/llama_kernels.h +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -19,6 +19,13 @@ void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, const int seq_len, const int max_length, cudaStream_t stream); +template +void invokeLLaMAInputIdsEmbeddingLookup(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); template void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream); diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index c0d673c2a..2f867186e 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1987,21 +1987,21 @@ __global__ void transpose_4d_load_from_cache(T* k_dst, const int size_per_head, const int seq_len, const int max_seq_len, - const int max_length) + const int attn_len) { - // [batch_size, head_num, max_length, size_per_head] + // [batch_size, head_num, attn_len, size_per_head] const int batch_id = blockIdx.y; const int head_id = blockIdx.z; // 16 byte loads will handle "x" dimension auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * max_seq_len + head_id * size_per_head * max_seq_len); - auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * max_length - + head_id * size_per_head * max_length); + auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * attn_len + + head_id * size_per_head * attn_len); auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * max_seq_len + head_id * size_per_head * max_seq_len); - auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_length - + head_id * size_per_head * max_length); + auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * attn_len + + head_id * size_per_head * attn_len); // idx is over output dimension L * size_per_head / x for values const int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -2009,7 +2009,7 @@ __global__ void transpose_4d_load_from_cache(T* k_dst, 
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; const int size_per_head_div_x = size_per_head / X_ELEMS; - if (idx >= size_per_head_div_x * max_length) { + if (idx >= size_per_head_div_x * attn_len) { return; } @@ -2027,15 +2027,15 @@ void invokeLLaMALoadFromCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int max_length, + const int attn_len, cudaStream_t stream) { constexpr int block_sz = 128; constexpr int x = (sizeof(T) == 4) ? 4 : 8; - dim3 grid((max_length * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); + dim3 grid((attn_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); transpose_4d_load_from_cache<<>>( - k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, max_length); + k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, attn_len); } #define INSTANTIATELOADFROMCACHE(T) \ @@ -2048,7 +2048,7 @@ void invokeLLaMALoadFromCache(T* k_dst, const int max_seq_len, \ const int size_per_head, \ const int local_head_num, \ - const int max_length, \ + const int attn_len, \ cudaStream_t stream) INSTANTIATELOADFROMCACHE(float); INSTANTIATELOADFROMCACHE(half); diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index c1d85816f..52fa0f053 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -226,7 +226,7 @@ void invokeLLaMALoadFromCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int max_length, + const int attn_len, cudaStream_t stream); template diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 5557454a5..0c2307fc8 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -29,17 +29,17 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten const AttentionWeight* attention_weights) { // input_tensors: - // input_query [token_num, hidden_dimension] - // attention_mask [batch_size, 1, seq_len, max_length] + // input_query [num_tokens, hidden_dimension] + // attention_mask [batch_size, 1, seq_len, attn_len] // attention_type [1] // layer_id [1], int on cpu // context_lengths, int, [batch_size] - // max_length, int, [batch_size] on cpu - // padding_offset, int, [token_num] (optional) + // attn_len, int, [batch_size] on cpu + // padding_offset, int, [num_tokens] (optional) // cu_seqlens, int, [batch_size] (optional) // output_tensors: - // hidden_features [token_num, hidden_dimension] + // hidden_features [num_tokens, hidden_dimension] // key_cache [batch, local_head_num, max_seq_len, size_per_head] // value_cache [batch, local_head_num, max_seq_len, size_per_head] @@ -47,13 +47,13 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten FT_CHECK(output_tensors->at("key_cache").shape.size() == 4); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); const int batch_size = input_tensors->at("attention_mask").shape[0]; - const int seq_len = input_tensors->at("attention_mask").shape[2]; const int max_seq_len = (int)(output_tensors->at("key_cache").shape[2]); const int layer_id = input_tensors->getVal("layer_id"); const int* padding_offset = 
input_tensors->getPtr("padding_offset", nullptr); const int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); const int* context_lengths = input_tensors->at("context_lengths").getPtr(); - const int max_length = input_tensors->at("max_length").getVal(); + const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int attn_len = input_tensors->at("attention_mask").shape[3]; T* attention_out = output_tensors->at("hidden_features").getPtr(); T* attention_input = input_tensors->at("input_query").getPtr(); @@ -68,14 +68,14 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten POP_RANGE; sync_check_cuda_error(); - const int m = input_tensors->at("input_query").shape[0]; + const int num_tokens = input_tensors->at("input_query").shape[0]; PUSH_RANGE("qkv_gemm"); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, 3 * hidden_units_, // n - m, + num_tokens, hidden_units_, // k attention_weights->query_weight.kernel, 3 * hidden_units_, // n @@ -97,7 +97,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten padding_offset, batch_size, seq_len, - m, + num_tokens, head_num_, size_per_head_, rotary_embedding_dim_, @@ -132,7 +132,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten max_seq_len, size_per_head_, head_num_, - max_length, + attn_len, stream_); if (attention_type == AttentionType::FUSED_MHA) { @@ -141,8 +141,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } else { const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = seq_len; // q length - const int attention_seq_len_2 = max_length; // kv length + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = attn_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); // @@ -175,7 +175,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten sync_check_cuda_error(); POP_RANGE; - PUSH_RANGE("softmax"); MaskedSoftmaxParam param; param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) @@ -264,7 +263,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten else { invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, qkv_buf_3_, - m, + num_tokens, batch_size, attention_seq_len_1, head_num_, @@ -283,7 +282,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, hidden_units_, - m, + num_tokens, hidden_units_, attention_weights->attention_output_weight.kernel, hidden_units_, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 95282ba1c..fb71c966f 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -29,6 +29,11 @@ namespace fastertransformer { template void LLaMA::initialize() { + check_cuda_error(cudaStreamCreateWithFlags(&comm_stream_, cudaStreamNonBlocking)); + for (int i = 0; i < num_buffers_; ++i) { + check_cuda_error(cudaEventCreate(&kern_event_[i])); + check_cuda_error(cudaEventCreate(&comm_event_[i])); + } llama_context_decoder_ = new LLaMAContextDecoder(head_num_, size_per_head_, inter_size_, @@ -67,6 +72,11 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ context_decoder_output_buf_ = (T*)(allocator_->reMalloc( context_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + for (int i = 0; i < num_buffers_; ++i) { + context_decoder_output_buf_clone_[i] = (T*)(allocator_->reMalloc( + 
context_decoder_output_buf_clone_[i], sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); + } + normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); @@ -82,6 +92,9 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&key_cache_)); allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); + for (int i = 0; i < num_buffers_; ++i) { + allocator_->free((void**)(&context_decoder_output_buf_clone_[i])); + } allocator_->free((void**)(&logits_buf_)); is_allocate_buffer_ = false; } @@ -173,6 +186,12 @@ LLaMA::LLaMA(LLaMA const& llama): template LLaMA::~LLaMA() { + check_cuda_error(cudaStreamDestroy(comm_stream_)); + for (int i = 0; i < num_buffers_; ++i) { + check_cuda_error(cudaEventDestroy(kern_event_[i])); + check_cuda_error(cudaEventDestroy(comm_event_[i])); + } + delete llama_context_decoder_; freeBuffer(); } @@ -192,64 +211,66 @@ void LLaMA::forward(std::unordered_map* output_ten { // // input_tensors: - // input_ids [batch_size, seq_len] + // input_ids [num_tokens] // input_lengths [batch_size] // context_lengths [batch_size] // num_tokens [1] int on cpu - // max_length [1] int on cpu + // seq_len [1] int on cpu + // attn_len [1] int on cpu // output_tensors: - // output_logits [batch_size, seq_len, vocab_size] + // output_logits [num_tokens, vocab_size] - FT_CHECK_WITH_INFO(input_tensors->size() == 5, "input_tensors->size() == 5"); + FT_CHECK_WITH_INFO(input_tensors->size() == 6, "input_tensors->size() == 6"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); const DataType data_type = getTensorType(); - const size_t batch_size = input_tensors->at("input_ids").shape[0]; - const int seq_len = input_tensors->at("input_ids").shape[1]; + const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); + const size_t batch_size = input_tensors->at("input_lengths").shape[0]; const int* input_ids = input_tensors->at("input_ids").getPtr(); const int* context_lengths = input_tensors->at("context_lengths").getPtr(); const int* input_lengths = input_tensors->at("input_lengths").getPtr(); - const int num_tokens = input_tensors->at("num_tokens").getVal(0); - const int max_length = input_tensors->at("max_length").getVal(0); + const int num_tokens = input_tensors->at("num_tokens").getVal(); + const int seq_len = input_tensors->at("seq_len").getVal(); + const int attn_len = input_tensors->at("attn_len").getVal(); allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); invokeLLaMABuildDecoderAttentionMask( - input_attention_mask_, input_lengths, context_lengths, batch_size, seq_len, max_length, stream_); + input_attention_mask_, input_lengths, context_lengths, batch_size, seq_len, attn_len, stream_); sync_check_cuda_error(); if (pipeline_para_.rank_ == 0) { - invokeInputIdsEmbeddingLookup(context_decoder_input_buf_, - llama_weights->pre_decoder_embedding_table, - input_ids, - seq_len, - batch_size, - hidden_units_, - stream_); + invokeLLaMAInputIdsEmbeddingLookup(context_decoder_input_buf_, + llama_weights->pre_decoder_embedding_table, + input_ids, + num_tokens, + hidden_units_, + stream_); sync_check_cuda_error(); } else { - int data_size = batch_size * seq_len * hidden_units_; - ftNcclRecv(context_decoder_input_buf_, data_size, pipeline_para_.rank_ - 1, 
pipeline_para_, stream_); + ftNcclRecv( + context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_); sync_check_cuda_error(); } std::unordered_map decoder_input_tensors{ {"decoder_input", - Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_input_buf_}}, + Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, context_decoder_input_buf_}}, {"attention_mask", - Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(max_length)}, input_attention_mask_}}, + Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, context_lengths}}, {"num_tokens", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &num_tokens}}, - {"max_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_length}}}; + {"seq_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &seq_len}}, + {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &attn_len}}}; std::unordered_map decoder_output_tensors{ {"decoder_output", - Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_output_buf_}}, + Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, context_decoder_output_buf_}}, {"key_cache", Tensor{MEMORY_GPU, data_type, @@ -265,13 +286,22 @@ void LLaMA::forward(std::unordered_map* output_ten &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { - ftNcclSend(context_decoder_output_buf_, - batch_size * seq_len * hidden_units_, + buf_no_ = (buf_no_ + 1) % num_buffers_; + check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); + invokeLLaMACopyKernel(context_decoder_output_buf_clone_[buf_no_], + context_decoder_output_buf_, + num_tokens * hidden_units_, + stream_); + sync_check_cuda_error(); + check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); + check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); + ftNcclSend(context_decoder_output_buf_clone_[buf_no_], + num_tokens * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, - stream_); + comm_stream_); + check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); sync_check_cuda_error(); } else { @@ -279,29 +309,47 @@ void LLaMA::forward(std::unordered_map* output_ten context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, layernorm_eps_, - batch_size * seq_len, + num_tokens, hidden_units_, stream_); sync_check_cuda_error(); + float alpha = 1.0f; + float beta = 0.0f; + float* output_logits = output_tensors->at("output_logits").getPtr(); + cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_32F, CUDA_R_32F); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, vocab_size_, - batch_size * seq_len, + num_tokens, hidden_units_, llama_weights->post_decoder_embedding.kernel, vocab_size_, normed_decoder_output_buf_, hidden_units_, // n - logits_buf_, + output_logits, vocab_size_); sync_check_cuda_error(); - - if (std::is_same::value) { - float* output_logits = output_tensors->at("output_logits").getPtr(); - invokeCudaCast(output_logits, logits_buf_, batch_size * seq_len * vocab_size_, stream_); - sync_check_cuda_error(); - } + cublas_wrapper_->setFP16GemmConfig(); + + // cublas_wrapper_->Gemm(CUBLAS_OP_N, + // CUBLAS_OP_N, + // vocab_size_, + // num_tokens, + // 
hidden_units_, + // llama_weights->post_decoder_embedding.kernel, + // vocab_size_, + // normed_decoder_output_buf_, + // hidden_units_, // n + // logits_buf_, + // vocab_size_); + // sync_check_cuda_error(); + // + // if (std::is_same::value) { + // float* output_logits = output_tensors->at("output_logits").getPtr(); + // invokeCudaCast(output_logits, logits_buf_, num_tokens * vocab_size_, stream_); + // sync_check_cuda_error(); + // } } } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 62af1a6d2..34e8c7ae9 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -38,6 +38,12 @@ class LLaMA: public BaseLayer { size_t random_seed_; size_t max_seq_len_; + static constexpr int num_buffers_ = 5; + int buf_no_ = 0; + cudaStream_t comm_stream_; + cudaEvent_t kern_event_[num_buffers_]; + cudaEvent_t comm_event_[num_buffers_]; + static constexpr float layernorm_eps_ = 1e-6f; size_t hidden_units_; @@ -59,16 +65,17 @@ class LLaMA: public BaseLayer { void initialize(); protected: - T* input_attention_mask_ = nullptr; - T* key_cache_ = nullptr; - T* value_cache_ = nullptr; + T* input_attention_mask_ = nullptr; + T* key_cache_ = nullptr; + T* value_cache_ = nullptr; T* decoder_output_buf_ = nullptr; T* normed_decoder_output_buf_ = nullptr; T* logits_buf_ = nullptr; - T* context_decoder_input_buf_ = nullptr; - T* context_decoder_output_buf_ = nullptr; + T* context_decoder_input_buf_ = nullptr; + T* context_decoder_output_buf_ = nullptr; + T* context_decoder_output_buf_clone_[num_buffers_] = {nullptr}; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 9eaa81758..0294c58d4 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -181,7 +181,8 @@ void LLaMAContextDecoder::forward(std::vector* {"input_lengths", input_tensors->at(2)}, {"context_lengths", input_tensors->at(3)}, {"num_tokens", input_tensors->at(4)}, - {"max_length", input_tensors->at(5)}}; + {"seq_len", input_tensors->at(5)}, + {"attn_len", input_tensors->at(6)}}; std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, {"key_cache", output_tensors->at(1)}, {"value_cache", output_tensors->at(2)}}; @@ -195,31 +196,29 @@ void LLaMAContextDecoder::forward(std::unordered_map* const std::vector*>* llama_decoder_layer_weight) { // input tensors: - // decoder_input [batch_size, seq_len, hidden_dimension], - // attention_mask [batch_size, 1, seq_len, seq_len] + // decoder_input [num_tokens, hidden_dimension], + // attention_mask [batch_size, 1, seq_len, attn_len] // input_lengths [batch_size] // context_lengths [batch_size] // num_tokens [1] int on cpu - // max_length [1] int on cpu + // seq_len [1] int on cpu + // attn_len [1] int on cpu // output tensors: - // decoder_output [batch_size, seq_len, hidden_dimension], + // decoder_output [num_tokens, hidden_dimension], // key_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] // value_cache [num_layer, batch, local_head_num, mxa_seq_len, size_per_head] - // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * batch_size'. - // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during - // computing. 
- - FT_CHECK(input_tensors->size() == 6); + FT_CHECK(input_tensors->size() == 7); FT_CHECK(output_tensors->size() == 3); const DataType data_type = getTensorType(); const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; + const int batch_size = input_tensors->at("input_lengths").shape[0]; const int* input_lengths = input_tensors->at("input_lengths").getPtr(); const int* context_lengths = input_tensors->at("context_lengths").getPtr(); - const int max_length = input_tensors->at("max_length").getVal(0); + const int num_tokens = input_tensors->at("num_tokens").getVal(); + const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int attn_len = input_tensors->at("attention_mask").shape[3]; const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; allocateBuffer(batch_size, seq_len, max_seq_len); @@ -248,10 +247,11 @@ void LLaMAContextDecoder::forward(std::unordered_map* padding_offset_, cu_seqlens_, input_lengths, batch_size, seq_len, stream_); sync_check_cuda_error(); - h_token_num = input_tensors->at("num_tokens").getVal(); + h_token_num = num_tokens; - invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); - sync_check_cuda_error(); + // invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, + // stream_); + // sync_check_cuda_error(); } for (int l = 0; l < num_layer_; l++) { @@ -262,14 +262,14 @@ void LLaMAContextDecoder::forward(std::unordered_map* const bool is_final = false; T* layer_input = decoder_layer_output_; T* layer_output = decoder_layer_output_; - if (!is_unpadded_mha) { - if (isFirstLayerParallelId(l)) { - layer_input = decoder_input; - } - if (isLastLayerParallelId(l)) { - layer_output = decoder_output; - } + // if (!is_unpadded_mha) { + if (isFirstLayerParallelId(l)) { + layer_input = decoder_input; + } + if (isLastLayerParallelId(l)) { + layer_output = decoder_output; } + // } invokeGeneralLLaMALayerNorm(decoder_normed_input_, layer_input, @@ -285,13 +285,13 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"attention_mask", Tensor{MEMORY_GPU, data_type, - {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(max_length)}, + {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(attn_len)}, attention_mask}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, input_lengths}}, {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, context_lengths}}, - {"max_length", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &max_length}}, + {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &attn_len}}, }; if (is_unpadded_mha) { @@ -346,11 +346,11 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); } - if (is_unpadded_mha) { - invokeRebuildPadding( - decoder_output, decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); - sync_check_cuda_error(); - } + // if (is_unpadded_mha) { + // invokeRebuildPadding( + // decoder_output, decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); + // sync_check_cuda_error(); + // } if (is_free_buffer_after_forward_ == true) { freeBuffer(); diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc 
b/src/fastertransformer/th_op/llama/LLaMA.cc index 8aada6aff..1e260eec6 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -78,7 +78,8 @@ th::Tensor LLaMA::forward(th::Tensor& output_logits, th::Tensor& input_lengths, th::Tensor& context_lengths, const int64_t num_tokens, - const int64_t max_length) + const int64_t seq_len, + const int64_t attn_len) { CHECK_TH_CUDA(input_ids); CHECK_CONTIGUOUS(input_ids); @@ -87,9 +88,8 @@ th::Tensor LLaMA::forward(th::Tensor& output_logits, CHECK_CONTIGUOUS(input_lengths); TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); - const int batch_size = input_ids.size(0); - const int seq_len = input_ids.size(1); - ftllama->forward(output_logits, input_ids, input_lengths, context_lengths, num_tokens, max_length); + const int batch_size = input_lengths.size(0); + ftllama->forward(output_logits, input_ids, input_lengths, context_lengths, num_tokens, seq_len, attn_len); return output_logits; } diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 365a07dd1..83c580ce8 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -34,7 +34,8 @@ class IFLLaMA { th::Tensor& input_lengths, th::Tensor& context_lengths, const int num_tokens, - const int max_length) = 0; + const int seq_len, + const int attn_len) = 0; }; template @@ -115,8 +116,6 @@ class FTLLaMA: public IFLLaMA { cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); cublasSetStream(cublasHandle, stream_); - /// ft::Allocator allocator = - // ft::Allocator(at::cuda::getCurrentCUDAStream().device_index()); allocator_ = new ft::Allocator(); cublas_wrapper_ = new ft::cublasMMWrapper( cublasHandle, cublasltHandle_, stream_, cublas_algo_map_, cublas_wrapper_mutex_, allocator_); @@ -176,28 +175,31 @@ class FTLLaMA: public IFLLaMA { th::Tensor& input_lengths, th::Tensor& context_lengths, const int num_tokens, - const int max_length) override + const int seq_len, + const int attn_len) override { - - const size_t batch_size = (size_t)input_ids.size(0); - const size_t seq_len = (size_t)input_ids.size(1); + const size_t batch_size = (size_t)input_lengths.size(0); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", - ft::Tensor{ - ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size, seq_len}, get_ptr(input_ids)}}, + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{batch_size, (size_t)seq_len}, + get_ptr(input_ids)}}, {"input_lengths", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(input_lengths)}}, {"context_lengths", - ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(context_lengths)}}, + ft::Tensor{ + ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(context_lengths)}}, {"num_tokens", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &num_tokens}}, - {"max_length", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &max_length}}}; + {"seq_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &seq_len}}, + {"attn_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &attn_len}}}; std::unordered_map output_tensors = std::unordered_map{ {"output_logits", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, - std::vector{batch_size, seq_len, vocab_size_}, + std::vector{batch_size, (size_t)seq_len, vocab_size_}, get_ptr(output_logits)}}}; try { @@ -264,12 +266,13 @@ class LLaMA: public 
th::jit::CustomClassHolder { ~LLaMA(); - th::Tensor forward(th::Tensor& output_logits, - th::Tensor& input_ids, - th::Tensor& input_lengths, - th::Tensor& context_lengths, - const int64_t num_tokens, - const int64_t max_length); + th::Tensor forward(th::Tensor& output_logits, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& context_lengths, + const int64_t num_tokens, + const int64_t seq_len, + const int64_t attn_len); private: const at::ScalarType st_; From 433f2c95032544aaef28a75cff7feca631674902 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 29 Sep 2023 20:28:07 +0000 Subject: [PATCH 42/55] ckpt --- src/fastertransformer/models/llama/LLaMA.cc | 34 +++++++++++++++++++ src/fastertransformer/models/llama/LLaMA.h | 8 +++-- .../models/llama/LLaMAContextDecoder.cc | 33 ++++++------------ .../models/llama/LLaMAContextDecoder.h | 2 -- 4 files changed, 50 insertions(+), 27 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index fb71c966f..b9162c041 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -61,6 +61,10 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ FT_LOG_DEBUG(__PRETTY_FUNCTION__); const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batch_size * max_seq_len * hidden_units_; + padding_offset_ = + reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); + cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + input_attention_mask_ = (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_seq_len, false)); @@ -88,6 +92,8 @@ template void LLaMA::freeBuffer() { if (is_allocate_buffer_) { + allocator_->free((void**)(&padding_offset_)); + allocator_->free((void**)(&cu_seqlens_)); allocator_->free((void**)(&input_attention_mask_)); allocator_->free((void**)(&key_cache_)); allocator_->free((void**)(&context_decoder_input_buf_)); @@ -238,6 +244,16 @@ void LLaMA::forward(std::unordered_map* output_ten allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); + if (is_unpadded_mha) { + invokeLLaMAGetPaddingOffsetAndCuSeqLens( + padding_offset_, cu_seqlens_, input_lengths, batch_size, seq_len, stream_); + sync_check_cuda_error(); + + // invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, + // stream_); + // sync_check_cuda_error(); + } + invokeLLaMABuildDecoderAttentionMask( input_attention_mask_, input_lengths, context_lengths, batch_size, seq_len, attn_len, stream_); sync_check_cuda_error(); @@ -252,8 +268,10 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); } else { + ftNcclGroupStart(); ftNcclRecv( context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_); + ftNcclGroupEnd(); sync_check_cuda_error(); } @@ -268,6 +286,13 @@ void LLaMA::forward(std::unordered_map* output_ten {"seq_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &seq_len}}, {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &attn_len}}}; + if (is_unpadded_mha) { + decoder_input_tensors.insert( + {"padding_offset", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)num_tokens}, padding_offset_}}); + decoder_input_tensors.insert( + {"cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens_}}); + } + std::unordered_map decoder_output_tensors{ 
{"decoder_output", Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, context_decoder_output_buf_}}, @@ -286,6 +311,13 @@ void LLaMA::forward(std::unordered_map* output_ten &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); + + // if (is_unpadded_mha) { + // invokeRebuildPadding( + // decoder_output, decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); + // sync_check_cuda_error(); + // } + if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { buf_no_ = (buf_no_ + 1) % num_buffers_; check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); @@ -296,11 +328,13 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); + ftNcclGroupStart(); ftNcclSend(context_decoder_output_buf_clone_[buf_no_], num_tokens * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, comm_stream_); + ftNcclGroupEnd(); check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); sync_check_cuda_error(); } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 34e8c7ae9..35ff68fd6 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -65,9 +65,11 @@ class LLaMA: public BaseLayer { void initialize(); protected: - T* input_attention_mask_ = nullptr; - T* key_cache_ = nullptr; - T* value_cache_ = nullptr; + int* padding_offset_ = nullptr; + int* cu_seqlens_ = nullptr; + T* input_attention_mask_ = nullptr; + T* key_cache_ = nullptr; + T* value_cache_ = nullptr; T* decoder_output_buf_ = nullptr; T* normed_decoder_output_buf_ = nullptr; diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 0294c58d4..4a257a405 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -62,9 +62,6 @@ void LLaMAContextDecoder::allocateBuffer() template void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len) { - padding_offset_ = - reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); - cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -82,8 +79,6 @@ void LLaMAContextDecoder::freeBuffer() allocator_->free((void**)(&decoder_normed_input_)); allocator_->free((void**)(&self_attn_output_)); allocator_->free((void**)(&decoder_layer_output_)); - allocator_->free((void**)(&cu_seqlens_)); - allocator_->free((void**)(&padding_offset_)); is_allocate_buffer_ = false; } } @@ -203,13 +198,15 @@ void LLaMAContextDecoder::forward(std::unordered_map* // num_tokens [1] int on cpu // seq_len [1] int on cpu // attn_len [1] int on cpu + // padding_offset [batch_size] int on cpu + // cu_seqlens [batch_size+1] int on cpu // output tensors: // decoder_output [num_tokens, hidden_dimension], // key_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] // value_cache [num_layer, batch, local_head_num, mxa_seq_len, size_per_head] - FT_CHECK(input_tensors->size() == 7); + FT_CHECK(input_tensors->size() >= 7); 
FT_CHECK(output_tensors->size() == 3); const DataType data_type = getTensorType(); const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); @@ -219,6 +216,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* const int num_tokens = input_tensors->at("num_tokens").getVal(); const int seq_len = input_tensors->at("attention_mask").shape[2]; const int attn_len = input_tensors->at("attention_mask").shape[3]; + const int* padding_offset = nullptr; + const int* cu_seqlens = nullptr; + if (is_unpadded_mha) { + padding_offset = input_tensors->at("padding_offset").getPtr(); + cu_seqlens = input_tensors->at("cu_seqlens").getPtr(); + } const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; allocateBuffer(batch_size, seq_len, max_seq_len); @@ -243,15 +246,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* size_t h_token_num = batch_size * seq_len; if (is_unpadded_mha) { - invokeLLaMAGetPaddingOffsetAndCuSeqLens( - padding_offset_, cu_seqlens_, input_lengths, batch_size, seq_len, stream_); - sync_check_cuda_error(); - h_token_num = num_tokens; - - // invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, - // stream_); - // sync_check_cuda_error(); } for (int l = 0; l < num_layer_; l++) { @@ -296,9 +291,9 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (is_unpadded_mha) { self_attention_input_tensors.insert("padding_offset", - Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset_}); + Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset}); self_attention_input_tensors.insert("cu_seqlens", - Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens_}); + Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens}); } size_t cache_offset = l - getFirstLayerParallelId(); @@ -346,12 +341,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); } - // if (is_unpadded_mha) { - // invokeRebuildPadding( - // decoder_output, decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); - // sync_check_cuda_error(); - // } - if (is_free_buffer_after_forward_ == true) { freeBuffer(); } diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index d76ff0687..eb4e64ef0 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -67,8 +67,6 @@ class LLaMAContextDecoder: public BaseLayer { void initialize(); protected: - int* padding_offset_ = nullptr; - int* cu_seqlens_ = nullptr; T* decoder_normed_input_ = nullptr; T* self_attn_output_ = nullptr; T* decoder_layer_output_ = nullptr; From b63b4969db2059e3790c13cf87c5f4874230fc36 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 00:03:50 +0000 Subject: [PATCH 43/55] ckpt --- .../kernels/llama_kernels.cu | 107 +++++++++++++++++- src/fastertransformer/kernels/llama_kernels.h | 12 ++ src/fastertransformer/models/llama/LLaMA.cc | 60 +++++----- src/fastertransformer/models/llama/LLaMA.h | 7 +- src/fastertransformer/th_op/llama/LLaMA.cc | 21 ++-- src/fastertransformer/th_op/llama/LLaMA.h | 38 +++++-- 6 files changed, 187 insertions(+), 58 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index 95700cb1f..4110819a8 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -1,4 +1,5 @@ #include 
"src/fastertransformer/kernels/llama_kernels.h" +#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh" #include "src/fastertransformer/utils/cuda_fp8_utils.h" #include @@ -10,6 +11,111 @@ using namespace std; namespace fastertransformer { +__global__ void LLaMA_log_softmax(float* out, const float* logits, const int num_tokens, const int vocab_size) +{ + // logits [T, V] + // out [T, V] + const int64_t ti = blockIdx.x; + __shared__ float s_sum, s_max; + + if (ti >= num_tokens) + return; + + float local_max = -1e20f; + for (int i = threadIdx.x; i < vocab_size; i += blockDim.x) { + float logit_val = logits[ti * vocab_size + i]; + local_max = fmax(logit_val, local_max); + } + + float max_val = blockDim.x <= 32 ? warpReduceMax(local_max) : blockReduceMax(local_max); + if (threadIdx.x == 0) { + s_max = max_val; + } + __syncthreads(); + + float local_sum = 0; + for (int i = threadIdx.x; i < vocab_size; i += blockDim.x) { + float logit_val = logits[ti * vocab_size + i]; + local_sum += __expf(logit_val - s_max); + } + float sum_val = blockDim.x <= 32 ? warpReduceSum(local_sum) : blockReduceSum(local_sum); + if (threadIdx.x == 0) { + // s_sum = sum_val + 1e-6f; + s_sum = sum_val; + } + __syncthreads(); + + for (int i = threadIdx.x; i < vocab_size; i += blockDim.x) { + float logit_val = logits[ti * vocab_size + i]; + out[ti * vocab_size + i] = (logit_val - s_max) - __logf(s_sum); + } +} + +void invokeLLaMALogSoftmax( + float* out, const float* logits, const int num_tokens, const int vocab_size, cudaStream_t stream) +{ + dim3 grid(num_tokens); + dim3 block(min(1024, vocab_size)); + LLaMA_log_softmax<<>>(out, logits, num_tokens, vocab_size); +} + +__global__ void LLaMA_gather_tokens_kernel(float* out, + const float* probs, + const int* input_ids, + const int* input_lengths, + const int* cu_seqlens, + const int batch_size, + const int vocab_size) +{ + /* + // probs: [T, V] + // input_ids: [T] + int batch_idx = blockIdx.x; + + if (batch_idx >= batch_size) + return; + + float val = 0.f; + // for (int i = cu_seqlens[batch_idx] + threadIdx.x; i < cu_seqlens[batch_idx + 1] - 1; i += blockDim.x) { + // int input_idx = input_ids[i + 1]; + // val += probs[i * vocab_size + input_idx]; + // } + for (int t = cu_seqlens[batch_idx]; t < cu_seqlens[batch_idx + 1] - 1; ++t) { + val += probs[t * vocab_size + input_ids[t + 1]]; + } + //float sum = blockReduceSum(val); + + if (threadIdx.x == 0) + out[batch_idx] = val; + */ + // for b in range(bsz): + // for i in range(choice_seq_lens_list[c][b]-1): + // t = choice_cum_seq_lens_list[c][b] + i + // choice_log_probs[b, c] = choice_log_probs[b, c] + log_likelihoods[t, choice_tokens_list[c][t+1]] + + for (int b = 0; b < batch_size; ++b) { + float val = 0.f; + for (int i = 0; i < input_lengths[b] - 1; ++i) { + int t = cu_seqlens[b] + i; + val += probs[t * vocab_size + input_ids[t + 1]]; + } + out[b] = val; + } +} + +void invokeLLaMAGatherTokens(float* out, + const float* probs, + const int* input_ids, + const int* input_lengths, + const int* cu_seqlens, + const int batch_size, + const int vocab_size, + cudaStream_t stream) +{ + LLaMA_gather_tokens_kernel<<<1, 1, 0, stream>>>( + out, probs, input_ids, input_lengths, cu_seqlens, batch_size, vocab_size); +} + template __global__ void LLaMAstart_id_embedding_lookups_kernel( T* out, const T* embedding_table, const int* input_ids, const int num_tokens, const int64_t hidden_units) @@ -145,7 +251,6 @@ template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, template __global__ void LLaMACopyKernel(T* dst, T* 
src, const int count) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; if (idx * X_ELEMS >= count) { diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h index 2d1c9592e..66488462d 100644 --- a/src/fastertransformer/kernels/llama_kernels.h +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -29,4 +29,16 @@ void invokeLLaMAInputIdsEmbeddingLookup(T* from_tensor, template void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream); + +void invokeLLaMAGatherTokens(float* out, + const float* probs, + const int* input_ids, + const int* input_lengths, + const int* cu_seqlens, + const int batch_size, + const int vocab_size, + cudaStream_t stream); + +void invokeLLaMALogSoftmax( + float* out, const float* logits, const int num_tokens, const int vocab_size, cudaStream_t stream); } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index b9162c041..854df9de6 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -83,7 +83,10 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); - logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); + logits_buf_ = + (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * seq_len * vocab_size_, false)); + log_likelihood_buf_ = + (float*)(allocator_->reMalloc(log_likelihood_buf_, sizeof(float) * batch_size * seq_len * vocab_size_, false)); is_allocate_buffer_ = true; } @@ -225,7 +228,9 @@ void LLaMA::forward(std::unordered_map* output_ten // attn_len [1] int on cpu // output_tensors: - // output_logits [num_tokens, vocab_size] + // hidden_vector [num_tokens, hidden_size] + // log_probs [num_tokens, vocab_size] + // out_log_probs [batch_size] FT_CHECK_WITH_INFO(input_tensors->size() == 6, "input_tensors->size() == 6"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); @@ -240,6 +245,9 @@ void LLaMA::forward(std::unordered_map* output_ten const int num_tokens = input_tensors->at("num_tokens").getVal(); const int seq_len = input_tensors->at("seq_len").getVal(); const int attn_len = input_tensors->at("attn_len").getVal(); + T* hidden_vector = output_tensors->at("hidden_vector").getPtr(); + float* log_probs = output_tensors->at("log_probs").getPtr(); + float* out_log_probs = output_tensors->at("out_log_probs").getPtr(); allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); @@ -277,7 +285,10 @@ void LLaMA::forward(std::unordered_map* output_ten std::unordered_map decoder_input_tensors{ {"decoder_input", - Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, context_decoder_input_buf_}}, + Tensor{MEMORY_GPU, + data_type, + {(size_t)num_tokens, hidden_units_}, + (pipeline_para_.rank_ == 0) ? 
context_decoder_input_buf_ : hidden_vector}}, {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, @@ -294,8 +305,7 @@ void LLaMA::forward(std::unordered_map* output_ten } std::unordered_map decoder_output_tensors{ - {"decoder_output", - Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, context_decoder_output_buf_}}, + {"decoder_output", Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, hidden_vector}}, {"key_cache", Tensor{MEMORY_GPU, data_type, @@ -311,7 +321,6 @@ void LLaMA::forward(std::unordered_map* output_ten &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - // if (is_unpadded_mha) { // invokeRebuildPadding( // decoder_output, decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); @@ -321,10 +330,8 @@ void LLaMA::forward(std::unordered_map* output_ten if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { buf_no_ = (buf_no_ + 1) % num_buffers_; check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); - invokeLLaMACopyKernel(context_decoder_output_buf_clone_[buf_no_], - context_decoder_output_buf_, - num_tokens * hidden_units_, - stream_); + invokeLLaMACopyKernel( + context_decoder_output_buf_clone_[buf_no_], hidden_vector, num_tokens * hidden_units_, stream_); sync_check_cuda_error(); check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); @@ -340,7 +347,7 @@ void LLaMA::forward(std::unordered_map* output_ten } else { invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, - context_decoder_output_buf_, + hidden_vector, llama_weights->post_decoder_layernorm.gamma, layernorm_eps_, num_tokens, @@ -348,9 +355,8 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); - float alpha = 1.0f; - float beta = 0.0f; - float* output_logits = output_tensors->at("output_logits").getPtr(); + float alpha = 1.0f; + float beta = 0.0f; cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_32F, CUDA_R_32F); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, @@ -361,29 +367,17 @@ void LLaMA::forward(std::unordered_map* output_ten vocab_size_, normed_decoder_output_buf_, hidden_units_, // n - output_logits, + logits_buf_, vocab_size_); sync_check_cuda_error(); cublas_wrapper_->setFP16GemmConfig(); - // cublas_wrapper_->Gemm(CUBLAS_OP_N, - // CUBLAS_OP_N, - // vocab_size_, - // num_tokens, - // hidden_units_, - // llama_weights->post_decoder_embedding.kernel, - // vocab_size_, - // normed_decoder_output_buf_, - // hidden_units_, // n - // logits_buf_, - // vocab_size_); - // sync_check_cuda_error(); - // - // if (std::is_same::value) { - // float* output_logits = output_tensors->at("output_logits").getPtr(); - // invokeCudaCast(output_logits, logits_buf_, num_tokens * vocab_size_, stream_); - // sync_check_cuda_error(); - // } + invokeLLaMALogSoftmax(log_probs, logits_buf_, num_tokens, vocab_size_, stream_); + sync_check_cuda_error(); + + invokeLLaMAGatherTokens( + out_log_probs, log_probs, input_ids, input_lengths, cu_seqlens_, batch_size, vocab_size_, stream_); + sync_check_cuda_error(); } } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 35ff68fd6..5f582510d 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ 
b/src/fastertransformer/models/llama/LLaMA.h @@ -71,9 +71,10 @@ class LLaMA: public BaseLayer { T* key_cache_ = nullptr; T* value_cache_ = nullptr; - T* decoder_output_buf_ = nullptr; - T* normed_decoder_output_buf_ = nullptr; - T* logits_buf_ = nullptr; + T* decoder_output_buf_ = nullptr; + T* normed_decoder_output_buf_ = nullptr; + float* logits_buf_ = nullptr; + float* log_likelihood_buf_ = nullptr; T* context_decoder_input_buf_ = nullptr; T* context_decoder_output_buf_ = nullptr; diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index 1e260eec6..3849e7c27 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -73,13 +73,15 @@ LLaMA::~LLaMA() delete ftllama; } -th::Tensor LLaMA::forward(th::Tensor& output_logits, - th::Tensor& input_ids, - th::Tensor& input_lengths, - th::Tensor& context_lengths, - const int64_t num_tokens, - const int64_t seq_len, - const int64_t attn_len) +std::vector LLaMA::forward(th::Tensor& hidden_vector, + th::Tensor& log_probs, + th::Tensor& out_log_probs, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& context_lengths, + const int64_t num_tokens, + const int64_t seq_len, + const int64_t attn_len) { CHECK_TH_CUDA(input_ids); CHECK_CONTIGUOUS(input_ids); @@ -89,8 +91,9 @@ th::Tensor LLaMA::forward(th::Tensor& output_logits, TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); const int batch_size = input_lengths.size(0); - ftllama->forward(output_logits, input_ids, input_lengths, context_lengths, num_tokens, seq_len, attn_len); - return output_logits; + ftllama->forward( + hidden_vector, log_probs, out_log_probs, input_ids, input_lengths, context_lengths, num_tokens, seq_len, attn_len); + return std::vector{hidden_vector, log_probs, out_log_probs}; } } // namespace torch_ext diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 83c580ce8..a853f0818 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -29,7 +29,9 @@ using std::vector; class IFLLaMA { public: virtual ~IFLLaMA() {} - virtual void forward(th::Tensor& output_logits, + virtual void forward(th::Tensor& hidden_vector, + th::Tensor& log_probs, + th::Tensor& out_log_probs, th::Tensor& input_ids, th::Tensor& input_lengths, th::Tensor& context_lengths, @@ -170,7 +172,9 @@ class FTLLaMA: public IFLLaMA { delete cublas_wrapper_mutex_; } - virtual void forward(th::Tensor& output_logits, + virtual void forward(th::Tensor& hidden_vector, + th::Tensor& log_probs, + th::Tensor& out_log_probs, th::Tensor& input_ids, th::Tensor& input_lengths, th::Tensor& context_lengths, @@ -196,11 +200,19 @@ class FTLLaMA: public IFLLaMA { {"attn_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &attn_len}}}; std::unordered_map output_tensors = std::unordered_map{ - {"output_logits", + {"hidden_vector", + ft::Tensor{ft::MEMORY_GPU, + (std::is_same::value) ? 
ft::TYPE_FP16 : ft::TYPE_FP32, + std::vector{(size_t)num_tokens, num_heads_ * size_per_head_}, + get_ptr(hidden_vector)}}, + {"log_probs", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, - std::vector{batch_size, (size_t)seq_len, vocab_size_}, - get_ptr(output_logits)}}}; + std::vector{(size_t)num_tokens, vocab_size_}, + get_ptr(log_probs)}}, + {"out_log_probs", + ft::Tensor{ + ft::MEMORY_GPU, ft::TYPE_FP32, std::vector{batch_size}, get_ptr(out_log_probs)}}}; try { ft::check_cuda_error(cudaEventSynchronize(event_)); @@ -266,13 +278,15 @@ class LLaMA: public th::jit::CustomClassHolder { ~LLaMA(); - th::Tensor forward(th::Tensor& output_logits, - th::Tensor& input_ids, - th::Tensor& input_lengths, - th::Tensor& context_lengths, - const int64_t num_tokens, - const int64_t seq_len, - const int64_t attn_len); + std::vector forward(th::Tensor& hidden_vector, + th::Tensor& log_probs, + th::Tensor& out_log_probs, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& context_lengths, + const int64_t num_tokens, + const int64_t seq_len, + const int64_t attn_len); private: const at::ScalarType st_; From 6ee6105f1ae3b0f8a63b5fee4bc88edfb281c714 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 19:40:03 +0000 Subject: [PATCH 44/55] 08:42 --- .../kernels/llama_kernels.cu | 24 ++++--- src/fastertransformer/models/llama/LLaMA.cc | 65 +++++++++++-------- src/fastertransformer/models/llama/LLaMA.h | 2 +- src/fastertransformer/th_op/llama/LLaMA.cc | 10 +-- src/fastertransformer/th_op/llama/LLaMA.h | 23 ++++--- 5 files changed, 73 insertions(+), 51 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index 4110819a8..0184bc0d6 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -93,14 +93,22 @@ __global__ void LLaMA_gather_tokens_kernel(float* out, // t = choice_cum_seq_lens_list[c][b] + i // choice_log_probs[b, c] = choice_log_probs[b, c] + log_likelihoods[t, choice_tokens_list[c][t+1]] - for (int b = 0; b < batch_size; ++b) { - float val = 0.f; - for (int i = 0; i < input_lengths[b] - 1; ++i) { - int t = cu_seqlens[b] + i; - val += probs[t * vocab_size + input_ids[t + 1]]; - } - out[b] = val; + // probs: [T, V] + // input_ids: [T] + int batch_idx = blockIdx.x; + + if (batch_idx >= batch_size) + return; + + float val = 0.f; + for (int i = threadIdx.x; i < input_lengths[batch_idx] - 1; i += blockDim.x) { + int t = cu_seqlens[batch_idx] + i; + val += probs[t * vocab_size + input_ids[t + 1]]; } + float sum = blockReduceSum(val); + + if (threadIdx.x == 0) + out[batch_idx] = sum; } void invokeLLaMAGatherTokens(float* out, @@ -112,7 +120,7 @@ void invokeLLaMAGatherTokens(float* out, const int vocab_size, cudaStream_t stream) { - LLaMA_gather_tokens_kernel<<<1, 1, 0, stream>>>( + LLaMA_gather_tokens_kernel<<>>( out, probs, input_ids, input_lengths, cu_seqlens, batch_size, vocab_size); } diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 854df9de6..6ab73fb66 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -226,11 +226,12 @@ void LLaMA::forward(std::unordered_map* output_ten // num_tokens [1] int on cpu // seq_len [1] int on cpu // attn_len [1] int on cpu + // is_context [1] int on cpu // output_tensors: // hidden_vector [num_tokens, hidden_size] // log_probs [num_tokens, vocab_size] - // out_log_probs [batch_size] + // cum_probs [batch_size] 
FT_CHECK_WITH_INFO(input_tensors->size() == 6, "input_tensors->size() == 6"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); @@ -245,9 +246,10 @@ void LLaMA::forward(std::unordered_map* output_ten const int num_tokens = input_tensors->at("num_tokens").getVal(); const int seq_len = input_tensors->at("seq_len").getVal(); const int attn_len = input_tensors->at("attn_len").getVal(); + const int is_context = input_tensors->at("is_context").getVal(); T* hidden_vector = output_tensors->at("hidden_vector").getPtr(); float* log_probs = output_tensors->at("log_probs").getPtr(); - float* out_log_probs = output_tensors->at("out_log_probs").getPtr(); + float* cum_probs = output_tensors->at("cum_probs").getPtr(); allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); @@ -276,11 +278,12 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); } else { - ftNcclGroupStart(); - ftNcclRecv( - context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - ftNcclGroupEnd(); - sync_check_cuda_error(); + // ftNcclGroupStart(); + // ftNcclRecv( + // context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, + // stream_); + // ftNcclGroupEnd(); + // sync_check_cuda_error(); } std::unordered_map decoder_input_tensors{ @@ -288,7 +291,7 @@ void LLaMA::forward(std::unordered_map* output_ten Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, - (pipeline_para_.rank_ == 0) ? context_decoder_input_buf_ : hidden_vector}}, + pipeline_para_.rank_ == 0 ? context_decoder_input_buf_ : hidden_vector}}, {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, @@ -305,7 +308,12 @@ void LLaMA::forward(std::unordered_map* output_ten } std::unordered_map decoder_output_tensors{ - {"decoder_output", Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, hidden_vector}}, + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {(size_t)num_tokens, hidden_units_}, + (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) ? 
context_decoder_output_buf_ : + hidden_vector}}, {"key_cache", Tensor{MEMORY_GPU, data_type, @@ -328,26 +336,27 @@ void LLaMA::forward(std::unordered_map* output_ten // } if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { - buf_no_ = (buf_no_ + 1) % num_buffers_; - check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); - invokeLLaMACopyKernel( - context_decoder_output_buf_clone_[buf_no_], hidden_vector, num_tokens * hidden_units_, stream_); - sync_check_cuda_error(); - check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); - check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); - ftNcclGroupStart(); - ftNcclSend(context_decoder_output_buf_clone_[buf_no_], - num_tokens * hidden_units_, - pipeline_para_.rank_ + 1, - pipeline_para_, - comm_stream_); - ftNcclGroupEnd(); - check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); - sync_check_cuda_error(); + // buf_no_ = (buf_no_ + 1) % num_buffers_; + // check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); + // invokeLLaMACopyKernel( + // context_decoder_output_buf_clone_[buf_no_], context_decoder_output_buf_, num_tokens * + // hidden_units_, stream_); + // sync_check_cuda_error(); + // check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); + // check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); + // ftNcclGroupStart(); + // ftNcclSend(context_decoder_output_buf_clone_[buf_no_], + // num_tokens * hidden_units_, + // pipeline_para_.rank_ + 1, + // pipeline_para_, + // comm_stream_); + // ftNcclGroupEnd(); + // check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); + // sync_check_cuda_error(); } - else { + else if (!is_context){ invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, - hidden_vector, + context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, layernorm_eps_, num_tokens, @@ -376,7 +385,7 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); invokeLLaMAGatherTokens( - out_log_probs, log_probs, input_ids, input_lengths, cu_seqlens_, batch_size, vocab_size_, stream_); + cum_probs, log_probs, input_ids, input_lengths, cu_seqlens_, batch_size, vocab_size_, stream_); sync_check_cuda_error(); } } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 5f582510d..f4143ee39 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -38,7 +38,7 @@ class LLaMA: public BaseLayer { size_t random_seed_; size_t max_seq_len_; - static constexpr int num_buffers_ = 5; + static constexpr int num_buffers_ = 10; int buf_no_ = 0; cudaStream_t comm_stream_; cudaEvent_t kern_event_[num_buffers_]; diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index 3849e7c27..b098f28f7 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -75,13 +75,15 @@ LLaMA::~LLaMA() std::vector LLaMA::forward(th::Tensor& hidden_vector, th::Tensor& log_probs, - th::Tensor& out_log_probs, + th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, th::Tensor& context_lengths, const int64_t num_tokens, const int64_t seq_len, - const int64_t attn_len) + const int64_t attn_len, + const int64_t is_context + ) { CHECK_TH_CUDA(input_ids); CHECK_CONTIGUOUS(input_ids); @@ -92,8 +94,8 @@ std::vector LLaMA::forward(th::Tensor& hidden_vector, const int batch_size = input_lengths.size(0); ftllama->forward( - 
hidden_vector, log_probs, out_log_probs, input_ids, input_lengths, context_lengths, num_tokens, seq_len, attn_len); - return std::vector{hidden_vector, log_probs, out_log_probs}; + hidden_vector, log_probs, cum_probs, input_ids, input_lengths, context_lengths, num_tokens, seq_len, attn_len, is_context); + return std::vector{hidden_vector, log_probs, cum_probs}; } } // namespace torch_ext diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index a853f0818..ff2caa238 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -31,13 +31,14 @@ class IFLLaMA { virtual ~IFLLaMA() {} virtual void forward(th::Tensor& hidden_vector, th::Tensor& log_probs, - th::Tensor& out_log_probs, + th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, th::Tensor& context_lengths, const int num_tokens, const int seq_len, - const int attn_len) = 0; + const int attn_len, + const int is_context) = 0; }; template @@ -174,13 +175,14 @@ class FTLLaMA: public IFLLaMA { virtual void forward(th::Tensor& hidden_vector, th::Tensor& log_probs, - th::Tensor& out_log_probs, + th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, th::Tensor& context_lengths, const int num_tokens, const int seq_len, - const int attn_len) override + const int attn_len, + const int is_context) override { const size_t batch_size = (size_t)input_lengths.size(0); @@ -197,7 +199,8 @@ class FTLLaMA: public IFLLaMA { ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(context_lengths)}}, {"num_tokens", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &num_tokens}}, {"seq_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &seq_len}}, - {"attn_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &attn_len}}}; + {"attn_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &attn_len}}, + {"is_context", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &is_context}}}; std::unordered_map output_tensors = std::unordered_map{ {"hidden_vector", @@ -210,9 +213,8 @@ class FTLLaMA: public IFLLaMA { ft::TYPE_FP32, std::vector{(size_t)num_tokens, vocab_size_}, get_ptr(log_probs)}}, - {"out_log_probs", - ft::Tensor{ - ft::MEMORY_GPU, ft::TYPE_FP32, std::vector{batch_size}, get_ptr(out_log_probs)}}}; + {"cum_probs", + ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, std::vector{batch_size}, get_ptr(cum_probs)}}}; try { ft::check_cuda_error(cudaEventSynchronize(event_)); @@ -280,13 +282,14 @@ class LLaMA: public th::jit::CustomClassHolder { std::vector forward(th::Tensor& hidden_vector, th::Tensor& log_probs, - th::Tensor& out_log_probs, + th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, th::Tensor& context_lengths, const int64_t num_tokens, const int64_t seq_len, - const int64_t attn_len); + const int64_t attn_len, + const int64_t is_context); private: const at::ScalarType st_; From 1955508c793280e482b41fe03ea708ef4c3a41c6 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 19:47:23 +0000 Subject: [PATCH 45/55] # input check bug fix --- src/fastertransformer/models/llama/LLaMA.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 6ab73fb66..14e7971f9 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -233,7 +233,7 @@ void LLaMA::forward(std::unordered_map* output_ten // 
log_probs [num_tokens, vocab_size] // cum_probs [batch_size] - FT_CHECK_WITH_INFO(input_tensors->size() == 6, "input_tensors->size() == 6"); + FT_CHECK_WITH_INFO(input_tensors->size() == 7, "input_tensors->size() == 6"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); From 57dded422703f7bcf789dd88ff93ebe3be74b74a Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 20:45:02 +0000 Subject: [PATCH 46/55] code rf --- .../attention_layers/LLaMAContextAttentionLayer.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 0c2307fc8..c8777c4cc 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -85,11 +85,12 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten 3 * hidden_units_ /* n */); sync_check_cuda_error(); - if (padding_offset != nullptr) { - // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync(q_buf_2_, 0, batch_size * max_seq_len * 3 * hidden_units_ * sizeof(T), stream_); - sync_check_cuda_error(); - } +// if (padding_offset != nullptr) { +// // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous +// cudaMemsetAsync(q_buf_2_, 0, batch_size * max_seq_len * 3 * hidden_units_ * sizeof(T), stream_); +// sync_check_cuda_error(); +// } + invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, v_buf_2_, From d3a83aac7fabf702307faefc038b191ad76412de Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 21:30:14 +0000 Subject: [PATCH 47/55] add macro --- src/fastertransformer/models/llama/LLaMA.cc | 86 ++++++++++++--------- src/fastertransformer/models/llama/LLaMA.h | 8 +- 2 files changed, 55 insertions(+), 39 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 14e7971f9..8ab1b1a6e 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -29,11 +29,14 @@ namespace fastertransformer { template void LLaMA::initialize() { +#ifdef USE_NCCL check_cuda_error(cudaStreamCreateWithFlags(&comm_stream_, cudaStreamNonBlocking)); for (int i = 0; i < num_buffers_; ++i) { check_cuda_error(cudaEventCreate(&kern_event_[i])); check_cuda_error(cudaEventCreate(&comm_event_[i])); } +#endif + llama_context_decoder_ = new LLaMAContextDecoder(head_num_, size_per_head_, inter_size_, @@ -76,10 +79,12 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ context_decoder_output_buf_ = (T*)(allocator_->reMalloc( context_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); +#ifdef USE_NCCL for (int i = 0; i < num_buffers_; ++i) { context_decoder_output_buf_clone_[i] = (T*)(allocator_->reMalloc( context_decoder_output_buf_clone_[i], sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); } +#endif normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -101,9 +106,11 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&key_cache_)); allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); +#ifdef USE_NCCL for (int i = 0; i < num_buffers_; ++i) { 
allocator_->free((void**)(&context_decoder_output_buf_clone_[i])); } +#endif allocator_->free((void**)(&logits_buf_)); is_allocate_buffer_ = false; } @@ -195,11 +202,13 @@ LLaMA::LLaMA(LLaMA const& llama): template LLaMA::~LLaMA() { +#ifdef USE_NCCL check_cuda_error(cudaStreamDestroy(comm_stream_)); for (int i = 0; i < num_buffers_; ++i) { check_cuda_error(cudaEventDestroy(kern_event_[i])); check_cuda_error(cudaEventDestroy(comm_event_[i])); } +#endif delete llama_context_decoder_; freeBuffer(); @@ -258,10 +267,6 @@ void LLaMA::forward(std::unordered_map* output_ten invokeLLaMAGetPaddingOffsetAndCuSeqLens( padding_offset_, cu_seqlens_, input_lengths, batch_size, seq_len, stream_); sync_check_cuda_error(); - - // invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, - // stream_); - // sync_check_cuda_error(); } invokeLLaMABuildDecoderAttentionMask( @@ -278,12 +283,13 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); } else { - // ftNcclGroupStart(); - // ftNcclRecv( - // context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, - // stream_); - // ftNcclGroupEnd(); - // sync_check_cuda_error(); +#ifdef USE_NCCL + ftNcclGroupStart(); + ftNcclRecv( + context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_); + ftNcclGroupEnd(); + sync_check_cuda_error(); +#endif } std::unordered_map decoder_input_tensors{ @@ -291,7 +297,12 @@ void LLaMA::forward(std::unordered_map* output_ten Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, - pipeline_para_.rank_ == 0 ? context_decoder_input_buf_ : hidden_vector}}, +#ifdef USE_NCCL + context_decoder_input_buf_ +#else + pipeline_para_.rank_ == 0 ? context_decoder_input_buf_ : hidden_vector +#endif + }}, {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, @@ -312,8 +323,12 @@ void LLaMA::forward(std::unordered_map* output_ten Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, - (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) ? context_decoder_output_buf_ : - hidden_vector}}, +#ifdef USE_NCCL + context_decoder_output_buf_ +#else + (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) ? 
context_decoder_output_buf_ : hidden_vector +#endif + }}, {"key_cache", Tensor{MEMORY_GPU, data_type, @@ -329,32 +344,29 @@ void LLaMA::forward(std::unordered_map* output_ten &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - // if (is_unpadded_mha) { - // invokeRebuildPadding( - // decoder_output, decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); - // sync_check_cuda_error(); - // } - if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { - // buf_no_ = (buf_no_ + 1) % num_buffers_; - // check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); - // invokeLLaMACopyKernel( - // context_decoder_output_buf_clone_[buf_no_], context_decoder_output_buf_, num_tokens * - // hidden_units_, stream_); - // sync_check_cuda_error(); - // check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); - // check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); - // ftNcclGroupStart(); - // ftNcclSend(context_decoder_output_buf_clone_[buf_no_], - // num_tokens * hidden_units_, - // pipeline_para_.rank_ + 1, - // pipeline_para_, - // comm_stream_); - // ftNcclGroupEnd(); - // check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); - // sync_check_cuda_error(); +#ifdef USE_NCCL + buf_no_ = (buf_no_ + 1) % num_buffers_; + check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); + invokeLLaMACopyKernel(context_decoder_output_buf_clone_[buf_no_], + context_decoder_output_buf_, + num_tokens * hidden_units_, + stream_); + sync_check_cuda_error(); + check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); + check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); + ftNcclGroupStart(); + ftNcclSend(context_decoder_output_buf_clone_[buf_no_], + num_tokens * hidden_units_, + pipeline_para_.rank_ + 1, + pipeline_para_, + comm_stream_); + ftNcclGroupEnd(); + check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); + sync_check_cuda_error(); +#endif } - else if (!is_context){ + else if (!is_context) { invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index f4143ee39..c5cb1c233 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -23,6 +23,8 @@ #include "src/fastertransformer/models/llama/LLaMAWeight.h" #include "src/fastertransformer/utils/custom_ar_comm.h" +#define USE_NCCL + namespace fastertransformer { template @@ -38,11 +40,14 @@ class LLaMA: public BaseLayer { size_t random_seed_; size_t max_seq_len_; - static constexpr int num_buffers_ = 10; +#ifdef USE_NCCL + static constexpr int num_buffers_ = 5; int buf_no_ = 0; cudaStream_t comm_stream_; cudaEvent_t kern_event_[num_buffers_]; cudaEvent_t comm_event_[num_buffers_]; + T* context_decoder_output_buf_clone_[num_buffers_] = {nullptr}; +#endif static constexpr float layernorm_eps_ = 1e-6f; @@ -78,7 +83,6 @@ class LLaMA: public BaseLayer { T* context_decoder_input_buf_ = nullptr; T* context_decoder_output_buf_ = nullptr; - T* context_decoder_output_buf_clone_[num_buffers_] = {nullptr}; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); From 1365462163606f06ac007a4e5f988c0b147d66b2 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 22:22:52 +0000 Subject: [PATCH 48/55] 07_03 
--- src/fastertransformer/models/llama/LLaMA.cc | 39 +++++++++++---------- src/fastertransformer/models/llama/LLaMA.h | 2 +- src/fastertransformer/th_op/llama/LLaMA.h | 3 +- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 8ab1b1a6e..b55e04149 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -284,10 +284,8 @@ void LLaMA::forward(std::unordered_map* output_ten } else { #ifdef USE_NCCL - ftNcclGroupStart(); ftNcclRecv( context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - ftNcclGroupEnd(); sync_check_cuda_error(); #endif } @@ -346,24 +344,29 @@ void LLaMA::forward(std::unordered_map* output_ten if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { #ifdef USE_NCCL - buf_no_ = (buf_no_ + 1) % num_buffers_; - check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); - invokeLLaMACopyKernel(context_decoder_output_buf_clone_[buf_no_], - context_decoder_output_buf_, - num_tokens * hidden_units_, - stream_); - sync_check_cuda_error(); - check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); - check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); + // buf_no_ = (buf_no_ + 1) % num_buffers_; + // check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); + // invokeLLaMACopyKernel(context_decoder_output_buf_clone_[buf_no_], + // context_decoder_output_buf_, + // num_tokens * hidden_units_, + // stream_); + // sync_check_cuda_error(); + // check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); + // check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); + // ftNcclGroupStart(); + // ftNcclSend(context_decoder_output_buf_clone_[buf_no_], + // num_tokens * hidden_units_, + // pipeline_para_.rank_ + 1, + // pipeline_para_, + // comm_stream_); + // ftNcclGroupEnd(); + // check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); + // sync_check_cuda_error(); + ftNcclGroupStart(); - ftNcclSend(context_decoder_output_buf_clone_[buf_no_], - num_tokens * hidden_units_, - pipeline_para_.rank_ + 1, - pipeline_para_, - comm_stream_); + ftNcclSend( + context_decoder_output_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, stream_); ftNcclGroupEnd(); - check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); - sync_check_cuda_error(); #endif } else if (!is_context) { diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index c5cb1c233..7bb73c524 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -23,7 +23,7 @@ #include "src/fastertransformer/models/llama/LLaMAWeight.h" #include "src/fastertransformer/utils/custom_ar_comm.h" -#define USE_NCCL +//#define USE_NCCL namespace fastertransformer { diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index ff2caa238..3677515fb 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -113,7 +113,8 @@ class FTLLaMA: public IFLLaMA { llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * num_layers_ + 3]); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); - ft::check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + //ft::check_cuda_error(cudaStreamCreateWithFlags(&stream_, 
cudaStreamNonBlocking)); + ft::check_cuda_error(cudaStreamCreate(&stream_)); ft::check_cuda_error(cudaEventCreate(&event_)); cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); From 305540f6510099dd544b295200ca664df9c5c051 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 22:54:55 +0000 Subject: [PATCH 49/55] add multiple devent --- src/fastertransformer/th_op/llama/LLaMA.h | 24 +++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 3677515fb..f5ffee6f7 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -113,9 +113,12 @@ class FTLLaMA: public IFLLaMA { llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * num_layers_ + 3]); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); - //ft::check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + // ft::check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); ft::check_cuda_error(cudaStreamCreate(&stream_)); - ft::check_cuda_error(cudaEventCreate(&event_)); + + for (int i = 0; i < num_events_; ++i) { + ft::check_cuda_error(cudaEventCreate(&event_[i])); + } cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); cublasSetStream(cublasHandle, stream_); @@ -160,7 +163,9 @@ class FTLLaMA: public IFLLaMA { ~FTLLaMA() override { - ft::check_cuda_error(cudaEventDestroy(event_)); + for (int i = 0; i < num_events_; ++i) { + ft::check_cuda_error(cudaEventDestroy(event_[i])); + } ft::check_cuda_error(cudaStreamDestroy(stream_)); delete llama_; @@ -218,12 +223,13 @@ class FTLLaMA: public IFLLaMA { ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, std::vector{batch_size}, get_ptr(cum_probs)}}}; try { - ft::check_cuda_error(cudaEventSynchronize(event_)); + ft::check_cuda_error(cudaEventSynchronize(event_[ev_no_])); llama_->forward(&output_tensors, &input_tensors, &llama_weights_); - ft::check_cuda_error(cudaEventRecord(event_, stream_)); + ft::check_cuda_error(cudaEventRecord(event_[ev_no_], stream_)); auto stream = at::cuda::getCurrentCUDAStream().stream(); - ft::check_cuda_error(cudaStreamWaitEvent(stream, event_)); + ft::check_cuda_error(cudaStreamWaitEvent(stream, event_[ev_no_])); + ev_no_ = (ev_no_ + 1) % num_events_; } catch (std::runtime_error& error) { std::cout << error.what(); @@ -247,8 +253,10 @@ class FTLLaMA: public IFLLaMA { int64_t tensor_para_size_; int64_t pipeline_para_size_; - cudaStream_t stream_; - cudaEvent_t event_; + static constexpr int num_events_ = 5; + int ev_no_ = 0; + cudaEvent_t event_[num_events_]; + cudaStream_t stream_; std::vector weights_; cublasLtHandle_t cublasltHandle_; From 294a6fc91089289d8aca50d0ed1c76a3df673ee7 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 1 Oct 2023 01:47:34 +0000 Subject: [PATCH 50/55] ft_llama-06_48 --- .../kernels/llama_kernels.cu | 114 ++++++++++++------ src/fastertransformer/kernels/llama_kernels.h | 17 ++- src/fastertransformer/models/llama/LLaMA.cc | 89 ++++++++++---- src/fastertransformer/models/llama/LLaMA.h | 7 +- .../models/llama/LLaMAContextDecoder.cc | 48 +++----- src/fastertransformer/th_op/llama/LLaMA.cc | 18 ++- src/fastertransformer/th_op/llama/LLaMA.h | 28 +++-- 7 files changed, 214 insertions(+), 107 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index 0184bc0d6..d007350fe 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu 
+++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -11,6 +11,70 @@ using namespace std; namespace fastertransformer { +template +__global__ void LLaMA_get_last_tokens(T* out, T* in, const int* cu_seqlens, int batch_size, int hidden_size) +{ + // in [num_tokens, hidden_size] + // out [batch_size, hidden_size] + int batch_idx = blockIdx.x; + + if (batch_idx >= batch_size) + return; + + int pos = cu_seqlens[batch_idx + 1] - 1; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + out[batch_idx * hidden_size + idx] = in[pos * hidden_size + idx]; + } +} + +template +void invokeLLaMAGetLastTokens( + T* out, T* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream) +{ + dim3 grid(batch_size); + dim3 block(256); + LLaMA_get_last_tokens<<>>(out, in, cu_seqlens, batch_size, hidden_size); +} + +template void invokeLLaMAGetLastTokens( + float* out, float* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); +template void invokeLLaMAGetLastTokens( + half* out, half* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); + +__global__ void LLaMA_extract_targets( + float* out, float* in, const int* target_ids, const int* cu_seqlens, int beam_width, int batch_size, int vocab_size, int num_tokens) +{ + // in [batch_size, vocab_size] + // target_ids [ beam_width, num_tokens ] + // out [beam_width, batch_size] + int batch_idx = blockIdx.x * blockDim.x + threadIdx.x; + int beam_idx = blockIdx.y * blockDim.y + threadIdx.y; + + if (batch_idx >= batch_size || beam_idx >= beam_width) + return; + + int pos = cu_seqlens[batch_idx + 1] - 1; + int target_idx = target_ids[beam_idx * num_tokens + pos]; + out[beam_idx * batch_size + batch_idx] = in[batch_idx * vocab_size + target_idx]; +} + +void invokeLLaMAExtractTargets(float* out, + float* in, + const int* target_ids, + const int* cu_seqlens, + int beam_width, + int batch_size, + int vocab_size, + int num_tokens, + cudaStream_t stream) +{ + dim3 block(32, 4); + dim3 grid((batch_size + block.x - 1) / block.x, (beam_width + block.y - 1) / block.y); + LLaMA_extract_targets<<>>( + out, in, target_ids, cu_seqlens, beam_width, batch_size, vocab_size, num_tokens); +} + __global__ void LLaMA_log_softmax(float* out, const float* logits, const int num_tokens, const int vocab_size) { // logits [T, V] @@ -61,49 +125,26 @@ void invokeLLaMALogSoftmax( __global__ void LLaMA_gather_tokens_kernel(float* out, const float* probs, - const int* input_ids, const int* input_lengths, + const int* target_ids, const int* cu_seqlens, const int batch_size, - const int vocab_size) + const int vocab_size, + const int num_tokens) { - /* - // probs: [T, V] - // input_ids: [T] - int batch_idx = blockIdx.x; - - if (batch_idx >= batch_size) - return; - - float val = 0.f; - // for (int i = cu_seqlens[batch_idx] + threadIdx.x; i < cu_seqlens[batch_idx + 1] - 1; i += blockDim.x) { - // int input_idx = input_ids[i + 1]; - // val += probs[i * vocab_size + input_idx]; - // } - for (int t = cu_seqlens[batch_idx]; t < cu_seqlens[batch_idx + 1] - 1; ++t) { - val += probs[t * vocab_size + input_ids[t + 1]]; - } - //float sum = blockReduceSum(val); - - if (threadIdx.x == 0) - out[batch_idx] = val; - */ - // for b in range(bsz): - // for i in range(choice_seq_lens_list[c][b]-1): - // t = choice_cum_seq_lens_list[c][b] + i - // choice_log_probs[b, c] = choice_log_probs[b, c] + log_likelihoods[t, choice_tokens_list[c][t+1]] - // probs: [T, V] - // input_ids: [T] + // target_ids: [T] + // out: [batch_size] int 
batch_idx = blockIdx.x; if (batch_idx >= batch_size) return; float val = 0.f; - for (int i = threadIdx.x; i < input_lengths[batch_idx] - 1; i += blockDim.x) { - int t = cu_seqlens[batch_idx] + i; - val += probs[t * vocab_size + input_ids[t + 1]]; + for (int i = threadIdx.x; i < input_lengths[batch_idx]; i += blockDim.x) { + int pos = cu_seqlens[batch_idx] + i; + int target_pos = target_ids[pos]; + val += (target_pos > 0) ? probs[pos * vocab_size + target_pos] : 0.f; } float sum = blockReduceSum(val); @@ -113,15 +154,18 @@ __global__ void LLaMA_gather_tokens_kernel(float* out, void invokeLLaMAGatherTokens(float* out, const float* probs, - const int* input_ids, const int* input_lengths, + const int* target_ids, const int* cu_seqlens, const int batch_size, const int vocab_size, + const int num_tokens, cudaStream_t stream) { - LLaMA_gather_tokens_kernel<<>>( - out, probs, input_ids, input_lengths, cu_seqlens, batch_size, vocab_size); + dim3 grid(batch_size); + dim3 block(256); + LLaMA_gather_tokens_kernel<<>>( + out, probs, input_lengths, target_ids, cu_seqlens, batch_size, vocab_size, num_tokens); } template diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h index 66488462d..754ed6bba 100644 --- a/src/fastertransformer/kernels/llama_kernels.h +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -32,13 +32,28 @@ void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream) void invokeLLaMAGatherTokens(float* out, const float* probs, - const int* input_ids, const int* input_lengths, + const int* target_ids, const int* cu_seqlens, const int batch_size, const int vocab_size, + const int num_tokens, cudaStream_t stream); void invokeLLaMALogSoftmax( float* out, const float* logits, const int num_tokens, const int vocab_size, cudaStream_t stream); + +template +void invokeLLaMAGetLastTokens( + T* out, T* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); + +void invokeLLaMAExtractTargets(float* out, + float* in, + const int* target_ids, + const int* cu_seqlens, + int beam_width, + int batch_size, + int vocab_size, + int num_tokens, + cudaStream_t stream); } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index b55e04149..f0f6d4697 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -86,6 +86,8 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ } #endif + context_output_buf_ = + (T*)(allocator_->reMalloc(context_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); logits_buf_ = @@ -106,6 +108,7 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&key_cache_)); allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); + allocator_->free((void**)(&context_output_buf_)); #ifdef USE_NCCL for (int i = 0; i < num_buffers_; ++i) { allocator_->free((void**)(&context_decoder_output_buf_clone_[i])); @@ -231,8 +234,8 @@ void LLaMA::forward(std::unordered_map* output_ten // input_tensors: // input_ids [num_tokens] // input_lengths [batch_size] + // target_ids [beam_width, num_tokens] // context_lengths [batch_size] - // num_tokens [1] int on cpu // seq_len [1] int on cpu // attn_len [1] int on cpu // is_context [1] int on cpu @@ 
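The rewritten LLaMA_gather_tokens_kernel accumulates, per sequence, the log-probability assigned to the supplied target token at every position, skipping positions whose target id is not positive, and then reduces the per-thread partials across the block. A minimal sketch of the same reduction that replaces FT's blockReduceSum helper with a plain shared-memory tree (it assumes a fixed, power-of-two block size of 256, matching the launcher above):

```cuda
#include <cuda_runtime.h>

// Sum the log-probabilities of the target token over one sequence per block.
//   log_probs : [num_tokens, vocab_size]
//   target_ids: [num_tokens]
//   out       : [batch_size]
// Launch with exactly 256 threads per block, one block per sequence.
__global__ void sum_target_logprobs(float* out, const float* log_probs,
                                    const int* target_ids, const int* cu_seqlens,
                                    const int* input_lengths, int vocab_size)
{
    __shared__ float partial[256];
    const int b = blockIdx.x;

    float val = 0.f;
    for (int i = threadIdx.x; i < input_lengths[b]; i += blockDim.x) {
        const int pos = cu_seqlens[b] + i;
        const int tgt = target_ids[pos];
        val += (tgt > 0) ? log_probs[pos * vocab_size + tgt] : 0.f;  // tgt <= 0 means "don't score"
    }

    // Plain shared-memory tree reduction (the patch uses FT's blockReduceSum).
    partial[threadIdx.x] = val;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride)
            partial[threadIdx.x] += partial[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        out[b] = partial[0];
}
```

Launched as sum_target_logprobs<<<batch_size, 256, 0, stream>>>(...), it produces one accumulated log-probability per sequence, which is what cum_probs stores in the non-context branch.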
-240,25 +243,30 @@ void LLaMA::forward(std::unordered_map* output_ten // output_tensors: // hidden_vector [num_tokens, hidden_size] // log_probs [num_tokens, vocab_size] - // cum_probs [batch_size] + // cum_probs [beam_width, batch_size] - FT_CHECK_WITH_INFO(input_tensors->size() == 7, "input_tensors->size() == 6"); - FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); + FT_CHECK_WITH_INFO(input_tensors->size() == 7, "input_tensors->size() == 7"); + FT_CHECK(input_tensors->at("input_ids").shape.size() == 1); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); + FT_CHECK(input_tensors->at("target_ids").shape.size() == 2); + FT_CHECK(input_tensors->at("context_lengths").shape.size() == 1); const DataType data_type = getTensorType(); const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); const size_t batch_size = input_tensors->at("input_lengths").shape[0]; - const int* input_ids = input_tensors->at("input_ids").getPtr(); - const int* context_lengths = input_tensors->at("context_lengths").getPtr(); - const int* input_lengths = input_tensors->at("input_lengths").getPtr(); - const int num_tokens = input_tensors->at("num_tokens").getVal(); - const int seq_len = input_tensors->at("seq_len").getVal(); - const int attn_len = input_tensors->at("attn_len").getVal(); - const int is_context = input_tensors->at("is_context").getVal(); - T* hidden_vector = output_tensors->at("hidden_vector").getPtr(); - float* log_probs = output_tensors->at("log_probs").getPtr(); - float* cum_probs = output_tensors->at("cum_probs").getPtr(); + const size_t num_tokens = input_tensors->at("input_ids").shape[0]; + const size_t beam_width = input_tensors->at("target_ids").shape[0]; + + const int* input_ids = input_tensors->at("input_ids").getPtr(); + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* target_ids = input_tensors->at("target_ids").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int seq_len = input_tensors->at("seq_len").getVal(); + const int attn_len = input_tensors->at("attn_len").getVal(); + const int is_context = input_tensors->at("is_context").getVal(); + T* hidden_vector = output_tensors->at("hidden_vector").getPtr(); + float* log_probs = output_tensors->at("log_probs").getPtr(); + float* cum_probs = output_tensors->at("cum_probs").getPtr(); allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); @@ -294,7 +302,7 @@ void LLaMA::forward(std::unordered_map* output_ten {"decoder_input", Tensor{MEMORY_GPU, data_type, - {(size_t)num_tokens, hidden_units_}, + {num_tokens, hidden_units_}, #ifdef USE_NCCL context_decoder_input_buf_ #else @@ -305,22 +313,19 @@ void LLaMA::forward(std::unordered_map* output_ten Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, context_lengths}}, - {"num_tokens", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &num_tokens}}, {"seq_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &seq_len}}, {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &attn_len}}}; if (is_unpadded_mha) { - decoder_input_tensors.insert( - {"padding_offset", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)num_tokens}, padding_offset_}}); - decoder_input_tensors.insert( - {"cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens_}}); + decoder_input_tensors.insert({"padding_offset", 
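The unpadded-MHA path above hands the decoder padding_offset and cu_seqlens alongside the packed token stream; cu_seqlens is simply the exclusive prefix sum of input_lengths with one trailing entry. A small host-side sketch of that relationship (the function name is illustrative):

```cuda
#include <vector>

// cu_seqlens[b+1] - cu_seqlens[b] == input_lengths[b]
// cu_seqlens[batch_size] == num_tokens
std::vector<int> build_cu_seqlens(const std::vector<int>& input_lengths)
{
    std::vector<int> cu_seqlens(input_lengths.size() + 1, 0);
    for (size_t b = 0; b < input_lengths.size(); ++b) {
        cu_seqlens[b + 1] = cu_seqlens[b] + input_lengths[b];
    }
    return cu_seqlens;
}
```

With this layout, cu_seqlens[b + 1] - 1 is the flat index of sequence b's last token, which is exactly what the last-token and target-extraction kernels read.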
Tensor{MEMORY_GPU, TYPE_INT32, {num_tokens}, padding_offset_}}); + decoder_input_tensors.insert({"cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size + 1}, cu_seqlens_}}); } std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, data_type, - {(size_t)num_tokens, hidden_units_}, + {num_tokens, hidden_units_}, #ifdef USE_NCCL context_decoder_output_buf_ #else @@ -369,7 +374,45 @@ void LLaMA::forward(std::unordered_map* output_ten ftNcclGroupEnd(); #endif } - else if (!is_context) { + else if (is_context) { + invokeLLaMAGetLastTokens( + context_output_buf_, context_decoder_output_buf_, cu_seqlens_, batch_size, hidden_units_, stream_); + sync_check_cuda_error(); + + invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, + context_output_buf_, + llama_weights->post_decoder_layernorm.gamma, + layernorm_eps_, + batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); + + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_32F, CUDA_R_32F); + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + vocab_size_, + batch_size, + hidden_units_, + llama_weights->post_decoder_embedding.kernel, + vocab_size_, + normed_decoder_output_buf_, + hidden_units_, // n + logits_buf_, + vocab_size_); + sync_check_cuda_error(); + cublas_wrapper_->setFP16GemmConfig(); + + invokeLLaMALogSoftmax(log_probs, logits_buf_, batch_size, vocab_size_, stream_); + sync_check_cuda_error(); + + invokeLLaMAExtractTargets( + cum_probs, log_probs, target_ids, cu_seqlens_, beam_width, batch_size, vocab_size_, num_tokens, stream_); + sync_check_cuda_error(); + } + else { invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, @@ -400,7 +443,7 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); invokeLLaMAGatherTokens( - cum_probs, log_probs, input_ids, input_lengths, cu_seqlens_, batch_size, vocab_size_, stream_); + cum_probs, log_probs, input_lengths, target_ids, cu_seqlens_, batch_size, vocab_size_, num_tokens, stream_); sync_check_cuda_error(); } } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 7bb73c524..592861008 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -46,7 +46,7 @@ class LLaMA: public BaseLayer { cudaStream_t comm_stream_; cudaEvent_t kern_event_[num_buffers_]; cudaEvent_t comm_event_[num_buffers_]; - T* context_decoder_output_buf_clone_[num_buffers_] = {nullptr}; + T* context_decoder_output_buf_clone_[num_buffers_] = {nullptr}; #endif static constexpr float layernorm_eps_ = 1e-6f; @@ -77,12 +77,13 @@ class LLaMA: public BaseLayer { T* value_cache_ = nullptr; T* decoder_output_buf_ = nullptr; + T* context_output_buf_ = nullptr; T* normed_decoder_output_buf_ = nullptr; float* logits_buf_ = nullptr; float* log_likelihood_buf_ = nullptr; - T* context_decoder_input_buf_ = nullptr; - T* context_decoder_output_buf_ = nullptr; + T* context_decoder_input_buf_ = nullptr; + T* context_decoder_output_buf_ = nullptr; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 4a257a405..5c90f303e 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ 
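In the new is_context branch, the gathered last-token states are layer-normed and projected onto the vocabulary with one GEMM before the log-softmax. cuBLAS is column-major, so passing vocab_size as m and batch_size as n yields the row-major product logits[batch, vocab] = normed[batch, hidden_units] x W[hidden_units, vocab]. A hedged sketch of the same dimension bookkeeping with a bare cublasGemmEx call, FP16 inputs and FP32 accumulation as set up by setGemmConfig in the patch (this bypasses FT's cublasMMWrapper and is only meant to show the mapping):

```cuda
#include <cublas_v2.h>
#include <cuda_fp16.h>

// Row-major: logits[batch, vocab] = hidden[batch, hidden_units] * W[hidden_units, vocab]
// Column-major view used by cuBLAS: C(vocab x batch) = A(vocab x hidden_units) * B(hidden_units x batch)
void vocab_projection(cublasHandle_t handle, const half* W, const half* hidden,
                      float* logits, int batch, int hidden_units, int vocab)
{
    const float alpha = 1.0f, beta = 0.0f;
    cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                 vocab, batch, hidden_units,
                 &alpha,
                 W, CUDA_R_16F, vocab,              // A: vocab x hidden_units, lda = vocab
                 hidden, CUDA_R_16F, hidden_units,  // B: hidden_units x batch, ldb = hidden_units
                 &beta,
                 logits, CUDA_R_32F, vocab,         // C: vocab x batch, ldc = vocab
                 CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
}
```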
-175,9 +175,8 @@ void LLaMAContextDecoder::forward(std::vector* {"attention_mask", input_tensors->at(1)}, {"input_lengths", input_tensors->at(2)}, {"context_lengths", input_tensors->at(3)}, - {"num_tokens", input_tensors->at(4)}, - {"seq_len", input_tensors->at(5)}, - {"attn_len", input_tensors->at(6)}}; + {"seq_len", input_tensors->at(4)}, + {"attn_len", input_tensors->at(5)}}; std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, {"key_cache", output_tensors->at(1)}, {"value_cache", output_tensors->at(2)}}; @@ -195,7 +194,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* // attention_mask [batch_size, 1, seq_len, attn_len] // input_lengths [batch_size] // context_lengths [batch_size] - // num_tokens [1] int on cpu // seq_len [1] int on cpu // attn_len [1] int on cpu // padding_offset [batch_size] int on cpu @@ -210,20 +208,20 @@ void LLaMAContextDecoder::forward(std::unordered_map* FT_CHECK(output_tensors->size() == 3); const DataType data_type = getTensorType(); const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); - const int batch_size = input_tensors->at("input_lengths").shape[0]; - const int* input_lengths = input_tensors->at("input_lengths").getPtr(); - const int* context_lengths = input_tensors->at("context_lengths").getPtr(); - const int num_tokens = input_tensors->at("num_tokens").getVal(); - const int seq_len = input_tensors->at("attention_mask").shape[2]; - const int attn_len = input_tensors->at("attention_mask").shape[3]; - const int* padding_offset = nullptr; - const int* cu_seqlens = nullptr; + const size_t batch_size = input_tensors->at("input_lengths").shape[0]; + const size_t num_tokens = input_tensors->at("decoder_input").shape[0]; + const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; + + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int attn_len = input_tensors->at("attention_mask").shape[3]; + const int* padding_offset = nullptr; + const int* cu_seqlens = nullptr; if (is_unpadded_mha) { padding_offset = input_tensors->at("padding_offset").getPtr(); cu_seqlens = input_tensors->at("cu_seqlens").getPtr(); } - - const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; allocateBuffer(batch_size, seq_len, max_seq_len); sync_check_cuda_error(); @@ -244,11 +242,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } - size_t h_token_num = batch_size * seq_len; - if (is_unpadded_mha) { - h_token_num = num_tokens; - } - for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l) == false) { continue; @@ -270,13 +263,13 @@ void LLaMAContextDecoder::forward(std::unordered_map* layer_input, llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, layernorm_eps_, - h_token_num, + num_tokens, hidden_units_, stream_); sync_check_cuda_error(); TensorMap self_attention_input_tensors{ - {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, + {"input_query", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, decoder_normed_input_}}, {"attention_mask", Tensor{MEMORY_GPU, data_type, @@ -291,7 +284,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (is_unpadded_mha) { self_attention_input_tensors.insert("padding_offset", - Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset}); + Tensor{MEMORY_GPU, 
TYPE_INT32, {num_tokens}, padding_offset}); self_attention_input_tensors.insert("cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens}); } @@ -302,7 +295,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* }; TensorMap self_attention_output_tensors{ - {"hidden_features", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, + {"hidden_features", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, self_attn_output_}}, {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; @@ -319,22 +312,21 @@ void LLaMAContextDecoder::forward(std::unordered_map* llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, layernorm_eps_, - h_token_num, + num_tokens, hidden_units_, stream_); sync_check_cuda_error(); TensorMap ffn_input_tensors( - {{"ffn_input", - Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, decoder_normed_input_}}}); TensorMap ffn_output_tensors( - {{"ffn_output", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); + {{"ffn_output", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, layer_output}}}); ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); invokeAddBiasResidual(layer_output, self_attn_output_, llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - h_token_num, + num_tokens, hidden_units_, stream_); diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index b098f28f7..580b05d5c 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -78,12 +78,11 @@ std::vector LLaMA::forward(th::Tensor& hidden_vector, th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, + th::Tensor& target_ids, th::Tensor& context_lengths, - const int64_t num_tokens, const int64_t seq_len, const int64_t attn_len, - const int64_t is_context - ) + const int64_t is_context) { CHECK_TH_CUDA(input_ids); CHECK_CONTIGUOUS(input_ids); @@ -92,9 +91,16 @@ std::vector LLaMA::forward(th::Tensor& hidden_vector, CHECK_CONTIGUOUS(input_lengths); TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); - const int batch_size = input_lengths.size(0); - ftllama->forward( - hidden_vector, log_probs, cum_probs, input_ids, input_lengths, context_lengths, num_tokens, seq_len, attn_len, is_context); + ftllama->forward(hidden_vector, + log_probs, + cum_probs, + input_ids, + input_lengths, + target_ids, + context_lengths, + seq_len, + attn_len, + is_context); return std::vector{hidden_vector, log_probs, cum_probs}; } diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index f5ffee6f7..029780e7f 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -34,8 +34,8 @@ class IFLLaMA { th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, + th::Tensor& target_ids, th::Tensor& context_lengths, - const int num_tokens, const int seq_len, const int attn_len, const int is_context) = 0; @@ 
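Each decoder layer above finishes with invokeAddBiasResidual, which adds the attention-branch activations kept in self_attn_output_ and the FFN output bias onto the FFN result in place. A minimal float-only sketch of that element-wise epilogue (the FT kernel is templated and more general):

```cuda
#include <cuda_runtime.h>

// out[t, h] = out[t, h] + residual[t, h] + bias[h]
// out/residual: [num_tokens, hidden_units], bias: [hidden_units]
__global__ void add_bias_residual(float* out, const float* residual, const float* bias,
                                  int num_tokens, int hidden_units)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_tokens * hidden_units)
        return;
    out[idx] += residual[idx] + bias[idx % hidden_units];
}
// launch: add_bias_residual<<<(num_tokens * hidden_units + 255) / 256, 256, 0, stream>>>(...)
```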
-184,26 +184,29 @@ class FTLLaMA: public IFLLaMA { th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, + th::Tensor& target_ids, th::Tensor& context_lengths, - const int num_tokens, const int seq_len, const int attn_len, const int is_context) override { const size_t batch_size = (size_t)input_lengths.size(0); + const size_t num_tokens = (size_t)input_ids.size(0); + const size_t beam_width = (size_t)target_ids.size(0); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_INT32, - std::vector{batch_size, (size_t)seq_len}, - get_ptr(input_ids)}}, + ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{num_tokens}, get_ptr(input_ids)}}, {"input_lengths", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(input_lengths)}}, + {"target_ids", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{beam_width, num_tokens}, + get_ptr(target_ids)}}, {"context_lengths", ft::Tensor{ ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(context_lengths)}}, - {"num_tokens", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &num_tokens}}, {"seq_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &seq_len}}, {"attn_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &attn_len}}, {"is_context", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &is_context}}}; @@ -212,15 +215,18 @@ class FTLLaMA: public IFLLaMA { {"hidden_vector", ft::Tensor{ft::MEMORY_GPU, (std::is_same::value) ? ft::TYPE_FP16 : ft::TYPE_FP32, - std::vector{(size_t)num_tokens, num_heads_ * size_per_head_}, + std::vector{num_tokens, num_heads_ * size_per_head_}, get_ptr(hidden_vector)}}, {"log_probs", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, - std::vector{(size_t)num_tokens, vocab_size_}, + std::vector{num_tokens, vocab_size_}, get_ptr(log_probs)}}, {"cum_probs", - ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, std::vector{batch_size}, get_ptr(cum_probs)}}}; + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_FP32, + std::vector{beam_width, batch_size}, + get_ptr(cum_probs)}}}; try { ft::check_cuda_error(cudaEventSynchronize(event_[ev_no_])); @@ -294,8 +300,8 @@ class LLaMA: public th::jit::CustomClassHolder { th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, + th::Tensor& target_ids, th::Tensor& context_lengths, - const int64_t num_tokens, const int64_t seq_len, const int64_t attn_len, const int64_t is_context); From a7e708917662079259a6fd9d3c545a90893c3ceb Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 1 Oct 2023 03:24:28 +0000 Subject: [PATCH 51/55] ref --- .../kernels/llama_kernels.cu | 26 ++ src/fastertransformer/kernels/llama_kernels.h | 2 + .../kernels/unfused_attention_kernels.cu | 66 ++-- .../kernels/unfused_attention_kernels.h | 16 +- .../LLaMAContextAttentionLayer.cc | 341 +++++++++--------- .../LLaMAContextAttentionLayer.h | 2 +- src/fastertransformer/models/llama/LLaMA.cc | 35 +- src/fastertransformer/models/llama/LLaMA.h | 27 +- .../models/llama/LLaMAContextDecoder.cc | 20 +- .../models/llama/LLaMAContextDecoder.h | 9 +- 10 files changed, 279 insertions(+), 265 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index d007350fe..d6d119227 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -327,4 +327,30 @@ void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream) template void 
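With the reworked FTLLaMA::forward interface, the caller pre-allocates hidden_vector as [num_tokens, num_heads * size_per_head], log_probs as [num_tokens, vocab_size] and cum_probs as [beam_width, batch_size]. A hedged sketch of that allocation with the ATen C++ API, assuming the FP16 instantiation of the model (the helper and its name are illustrative, not part of the extension):

```cuda
#include <torch/torch.h>
#include <tuple>

// Pre-allocate the three output tensors that FTLLaMA::forward fills in.
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
make_llama_outputs(int64_t num_tokens, int64_t hidden_units, int64_t vocab_size,
                   int64_t beam_width, int64_t batch_size)
{
    auto fp16 = torch::dtype(torch::kFloat16).device(torch::kCUDA);
    auto fp32 = torch::dtype(torch::kFloat32).device(torch::kCUDA);
    torch::Tensor hidden_vector = torch::empty({num_tokens, hidden_units}, fp16);
    torch::Tensor log_probs     = torch::empty({num_tokens, vocab_size}, fp32);
    torch::Tensor cum_probs     = torch::empty({beam_width, batch_size}, fp32);
    return std::make_tuple(hidden_vector, log_probs, cum_probs);
}
```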
invokeLLaMACopyKernel(float* dst, float* src, const int count, cudaStream_t stream); template void invokeLLaMACopyKernel(half* dst, half* src, const int count, cudaStream_t stream); +template +__global__ void LLaMAMemset0Kernel(T* dst, const int count) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + if (idx * X_ELEMS >= count) { + return; + } + + auto v_dst = reinterpret_cast(dst); + v_dst[idx] = {0}; +} + +template +void invokeLLaMAMemset0(T* dst, const int count, cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 4 : 8; + assert(count % x == 0); + int grid_sz = (count / x + block_sz - 1) / block_sz; + LLaMAMemset0Kernel<<>>(dst, count); +} + +template void invokeLLaMAMemset0(float* dst, const int count, cudaStream_t stream); +template void invokeLLaMAMemset0(half* dst, const int count, cudaStream_t stream); + } // namespace fastertransformer diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h index 754ed6bba..f0d356a09 100644 --- a/src/fastertransformer/kernels/llama_kernels.h +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -29,6 +29,8 @@ void invokeLLaMAInputIdsEmbeddingLookup(T* from_tensor, template void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream); +template +void invokeLLaMAMemset0(T* dst, const int count, cudaStream_t stream); void invokeLLaMAGatherTokens(float* out, const float* probs, diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 2f867186e..a513c1b47 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1902,11 +1902,11 @@ __global__ void transpose_4d_save_to_cache(T* k_dst, const T* k_src, T* v_dst, const T* v_src, + const int* context_lengths, const int head_num, const int size_per_head, const int seq_len, - const int max_seq_len, - const int* context_lengths) + const int max_seq_len) { // [batch_size, head_num, seq_len, size_per_head] const int batch_id = blockIdx.y; @@ -1943,20 +1943,20 @@ void invokeLLaMASaveToCache(T* k_dst, T* v_dst, const T* k_src, const T* v_src, - const int local_batch_size, + const int* context_lengths, + const int batch_size, + const int head_num, + const int size_per_head, const int seq_len, const int max_seq_len, - const int size_per_head, - const int local_head_num, - const int* context_lengths, cudaStream_t stream) { constexpr int block_sz = 128; constexpr int x = (sizeof(T) == 4) ? 
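The new invokeLLaMAMemset0 zeroes a buffer with 16-byte vector stores, using a 4-wide vector for 4-byte element types and an 8-wide one for 2-byte types, and asserts that count is divisible by the vector width. A standalone sketch specialized to float4, so each thread clears four floats with a single store:

```cuda
#include <cassert>
#include <cuda_runtime.h>

// Zero `count` floats with 16-byte stores; count must be a multiple of 4.
__global__ void memset0_float4(float* dst, int count)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx * 4 >= count)
        return;
    reinterpret_cast<float4*>(dst)[idx] = make_float4(0.f, 0.f, 0.f, 0.f);
}

void launch_memset0(float* dst, int count, cudaStream_t stream)
{
    constexpr int block = 128;
    assert(count % 4 == 0);
    const int grid = (count / 4 + block - 1) / block;
    memset0_float4<<<grid, block, 0, stream>>>(dst, count);
}
```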
4 : 8; - dim3 grid((seq_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); + dim3 grid((seq_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num); transpose_4d_save_to_cache<<>>( - k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, context_lengths); + k_dst, k_src, v_dst, v_src, context_lengths, head_num, size_per_head, seq_len, max_seq_len); } #define INSTANTIATESAVETOCACHE(T) \ @@ -1964,12 +1964,12 @@ void invokeLLaMASaveToCache(T* k_dst, T* v_dst, \ const T* k_src, \ const T* v_src, \ - const int local_batch_size, \ + const int* context_lengths, \ + const int batch_size, \ + const int head_num, \ + const int size_per_head, \ const int seq_len, \ const int max_seq_len, \ - const int size_per_head, \ - const int local_head_num, \ - const int* start_pos, \ cudaStream_t stream) INSTANTIATESAVETOCACHE(float); INSTANTIATESAVETOCACHE(half); @@ -1979,19 +1979,19 @@ INSTANTIATESAVETOCACHE(__nv_bfloat16); #undef INSTANTIATESAVETOCACHE template -__global__ void transpose_4d_load_from_cache(T* k_dst, - const T* k_src, - T* v_dst, - const T* v_src, - const int head_num, - const int size_per_head, - const int seq_len, - const int max_seq_len, - const int attn_len) +__global__ void transpose_4d_load_from_cache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int head_num, + const int size_per_head, + const int seq_len, + const int attn_len, + const int max_seq_len) { // [batch_size, head_num, attn_len, size_per_head] - const int batch_id = blockIdx.y; - const int head_id = blockIdx.z; + const int batch_id = blockIdx.y; + const int head_id = blockIdx.z; // 16 byte loads will handle "x" dimension auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * max_seq_len @@ -2022,20 +2022,20 @@ void invokeLLaMALoadFromCache(T* k_dst, T* v_dst, const T* k_src, const T* v_src, - const int local_batch_size, - const int seq_len, - const int max_seq_len, + const int batch_size, + const int head_num, const int size_per_head, - const int local_head_num, + const int seq_len, const int attn_len, + const int max_seq_len, cudaStream_t stream) { constexpr int block_sz = 128; constexpr int x = (sizeof(T) == 4) ? 
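invokeLLaMASaveToCache appends the K/V of the current chunk into the persistent [batch_size, head_num, max_seq_len, size_per_head] cache, starting at row context_lengths[b] for each sequence, and invokeLLaMALoadFromCache later reads the first attn_len rows back out in the same layout. A scalar (non-vectorized) sketch of the save direction under those layout assumptions:

```cuda
#include <cuda_runtime.h>

// cache layout: [batch, head_num, max_seq_len, size_per_head]
// src   layout: [batch, head_num, seq_len,     size_per_head]
// New rows for sequence b land at rows [context_lengths[b], context_lengths[b] + seq_len).
__global__ void save_to_cache(float* k_cache, const float* k_src,
                              const int* context_lengths,
                              int head_num, int seq_len, int max_seq_len, int size_per_head)
{
    const int b = blockIdx.z;   // batch index
    const int h = blockIdx.y;   // head index
    const int s = blockIdx.x;   // position inside the current chunk
    const int dst_row = context_lengths[b] + s;

    const float* src = k_src   + ((b * head_num + h) * seq_len     + s)       * size_per_head;
    float*       dst = k_cache + ((b * head_num + h) * max_seq_len + dst_row) * size_per_head;
    for (int i = threadIdx.x; i < size_per_head; i += blockDim.x) {
        dst[i] = src[i];
    }
}
// launch: save_to_cache<<<dim3(seq_len, head_num, batch_size), 128, 0, stream>>>(...)
```

The FT kernels do the same addressing with 16-byte loads and a flattened grid; the value cache is handled identically with a second pair of pointers.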
4 : 8; - dim3 grid((attn_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); + dim3 grid((attn_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num); transpose_4d_load_from_cache<<>>( - k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, attn_len); + k_dst, v_dst, k_src, v_src, head_num, size_per_head, seq_len, attn_len, max_seq_len); } #define INSTANTIATELOADFROMCACHE(T) \ @@ -2043,12 +2043,12 @@ void invokeLLaMALoadFromCache(T* k_dst, T* v_dst, \ const T* k_src, \ const T* v_src, \ - const int local_batch_size, \ + const int batch_size, \ + const int head_num, \ + const int size_per_head, \ const int seq_len, \ + const int attn_len, \ const int max_seq_len, \ - const int size_per_head, \ - const int local_head_num, \ - const int attn_len, \ cudaStream_t stream) INSTANTIATELOADFROMCACHE(float); INSTANTIATELOADFROMCACHE(half); diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index 52fa0f053..4f55af19e 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -209,24 +209,24 @@ void invokeLLaMASaveToCache(T* k_dst, T* v_dst, const T* k_src, const T* v_src, - const int local_batch_size, + const int* context_lengths, + const int batch_size, + const int head_num, + const int size_per_head, const int seq_len, const int max_seq_len, - const int size_per_head, - const int local_head_num, - const int* start_pos, cudaStream_t stream); template void invokeLLaMALoadFromCache(T* k_dst, T* v_dst, const T* k_src, const T* v_src, - const int local_batch_size, - const int seq_len, - const int max_seq_len, + const int batch_size, + const int head_num, const int size_per_head, - const int local_head_num, + const int seq_len, const int attn_len, + const int max_seq_len, cudaStream_t stream); template diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index c8777c4cc..10d7c7673 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -31,12 +31,10 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // input_tensors: // input_query [num_tokens, hidden_dimension] // attention_mask [batch_size, 1, seq_len, attn_len] - // attention_type [1] - // layer_id [1], int on cpu // context_lengths, int, [batch_size] - // attn_len, int, [batch_size] on cpu - // padding_offset, int, [num_tokens] (optional) - // cu_seqlens, int, [batch_size] (optional) + // attention_type [1] + // padding_offset [num_tokens] (optional) + // cu_seqlens [batch_size+1] (optional) // output_tensors: // hidden_features [num_tokens, hidden_dimension] @@ -46,25 +44,27 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); FT_CHECK(output_tensors->at("key_cache").shape.size() == 4); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); - const int batch_size = input_tensors->at("attention_mask").shape[0]; - const int max_seq_len = (int)(output_tensors->at("key_cache").shape[2]); - const int layer_id = input_tensors->getVal("layer_id"); - const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); - const int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); - const int* 
context_lengths = input_tensors->at("context_lengths").getPtr(); - const int seq_len = input_tensors->at("attention_mask").shape[2]; - const int attn_len = input_tensors->at("attention_mask").shape[3]; - - T* attention_out = output_tensors->at("hidden_features").getPtr(); - T* attention_input = input_tensors->at("input_query").getPtr(); - T* attention_mask = input_tensors->at("attention_mask").getPtr(); - - const AttentionType attention_type = input_tensors->getVal("attention_type"); + const int batch_size = input_tensors->at("attention_mask").shape[0]; + const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int attn_len = input_tensors->at("attention_mask").shape[3]; + const int max_seq_len = output_tensors->at("key_cache").shape[2]; + + T* attention_input = input_tensors->at("input_query").getPtr(); + T* attention_mask = input_tensors->at("attention_mask").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); + const int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + const AttentionType attention_type = input_tensors->getVal("attention_type"); + T* attention_out = output_tensors->at("hidden_features").getPtr(); + T* key_cache = output_tensors->getPtr("key_cache"); + T* value_cache = output_tensors->getPtr("value_cache"); + + FT_CHECK_WITH_INFO(seq_len <= attn_len, "seq_len must be larger than or equal to attn_len"); FT_CHECK_WITH_INFO(attention_type != AttentionType::FUSED_PADDED_MHA, "LLaMA Context FUSED_PADDED_MHA is not supported !"); PUSH_RANGE("attention buffer alloc"); - allocateBuffer(batch_size, seq_len, max_seq_len, attention_type != AttentionType::FUSED_MHA); + allocateBuffer(batch_size, seq_len, attn_len, max_seq_len, attention_type != AttentionType::FUSED_MHA); POP_RANGE; sync_check_cuda_error(); @@ -85,11 +85,11 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten 3 * hidden_units_ /* n */); sync_check_cuda_error(); -// if (padding_offset != nullptr) { -// // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous -// cudaMemsetAsync(q_buf_2_, 0, batch_size * max_seq_len * 3 * hidden_units_ * sizeof(T), stream_); -// sync_check_cuda_error(); -// } + if (padding_offset != nullptr) { + // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous + cudaMemsetAsync(q_buf_2_, 0, batch_size * (seq_len + 2 * attn_len) * hidden_units_ * sizeof(T), stream_); + sync_check_cuda_error(); + } invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, @@ -106,177 +106,169 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten stream_); sync_check_cuda_error(); - // key_cache [batch, local_head_num, max_seq_len, size_per_head] - // value_cache [batch, local_head_num, max_seq_len, size_per_head] - T* key_cache = output_tensors->getPtr("key_cache"); - T* value_cache = output_tensors->getPtr("value_cache"); invokeLLaMASaveToCache(key_cache, value_cache, k_buf_2_, v_buf_2_, + context_lengths, batch_size, + head_num_, + size_per_head_, seq_len, max_seq_len, - size_per_head_, - head_num_, - context_lengths, stream_); sync_check_cuda_error(); - POP_RANGE; invokeLLaMALoadFromCache(k_buf_2_, v_buf_2_, key_cache, value_cache, batch_size, - seq_len, - max_seq_len, - size_per_head_, head_num_, + size_per_head_, + seq_len, attn_len, + max_seq_len, stream_); + sync_check_cuda_error(); - if (attention_type == AttentionType::FUSED_MHA) { - dispatcher_fp16->setup_causal_masked_fmha(seq_len, batch_size); - dispatcher_fp16->run_causal_masked_fmha(qkv_buf_, 
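The shape check above requires attn_len >= seq_len (despite the wording of the FT_CHECK message, it is the key/value length that must cover the query chunk): each query chunk attends to the cached context plus its own tokens. Under the plausible reading that attn_len spans context plus chunk, query position i of sequence b may look at key positions up to context_lengths[b] + i. A host-side sketch of such a [batch_size, seq_len, attn_len] mask, with 1.0 marking visible positions (the exact layout of input_attention_mask_ is an assumption here):

```cuda
#include <vector>

// mask[b][i][j] == 1.0f  iff  query i of sequence b may look at key j.
std::vector<float> build_causal_mask(const std::vector<int>& input_lengths,
                                     const std::vector<int>& context_lengths,
                                     int seq_len, int attn_len)
{
    const int batch_size = static_cast<int>(input_lengths.size());
    std::vector<float> mask(static_cast<size_t>(batch_size) * seq_len * attn_len, 0.f);
    for (int b = 0; b < batch_size; ++b) {
        for (int i = 0; i < input_lengths[b]; ++i) {
            const int visible = context_lengths[b] + i;  // last visible key index
            for (int j = 0; j <= visible && j < attn_len; ++j) {
                mask[(static_cast<size_t>(b) * seq_len + i) * attn_len + j] = 1.f;
            }
        }
    }
    return mask;
}
```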
cu_seqlens, qkv_buf_3_, true, stream_); + POP_RANGE; + + const cudaDataType_t gemm_data_type = getCudaDataType(); + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = attn_len; // kv length + const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); + + // + // softmax(Q*K^T) + // + if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { + PUSH_RANGE("Q*K batch gemm"); + + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, + CUBLAS_OP_N, + attention_seq_len_2, // n + attention_seq_len_1, // m + size_per_head_, // k + 1.0f, + k_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_2 * size_per_head_, // n * k + q_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_1 * size_per_head_, // m * k + 0.0f, + qk_buf_float_, + CUDA_R_32F, + attention_seq_len_2, // n + attention_seq_len_2 * attention_seq_len_1, + batch_size * head_num_, // global batch size + CUDA_R_32F); + sync_check_cuda_error(); + POP_RANGE; + + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = nullptr; + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; } else { - const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = seq_len; // q length - const int attention_seq_len_2 = attn_len; // kv length - const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); - - // - // softmax(Q*K^T) - // - if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { - PUSH_RANGE("Q*K batch gemm"); - - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, - CUBLAS_OP_N, - attention_seq_len_2, // n - attention_seq_len_1, // m - size_per_head_, // k - 1.0f, - k_buf_2_, - gemm_data_type, - size_per_head_, // k - attention_seq_len_2 * size_per_head_, // n * k - q_buf_2_, - gemm_data_type, - size_per_head_, // k - attention_seq_len_1 * size_per_head_, // m * k - 0.0f, - qk_buf_float_, - CUDA_R_32F, - attention_seq_len_2, // n - attention_seq_len_2 * attention_seq_len_1, - batch_size * head_num_, // global batch size - CUDA_R_32F); - sync_check_cuda_error(); - POP_RANGE; - - PUSH_RANGE("softmax"); - MaskedSoftmaxParam param; - param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) - param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = batch_size; - param.q_length = attention_seq_len_1; - param.k_length = attention_seq_len_2; - param.num_heads = head_num_; - param.qk_scale = qk_scale; - param.linear_bias_slopes = nullptr; - invokeMaskedSoftmax(param, stream_); - sync_check_cuda_error(); - POP_RANGE; - } - else { - PUSH_RANGE("Q*K batch gemm"); - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, - CUBLAS_OP_N, - attention_seq_len_2, - attention_seq_len_1, - size_per_head_, - k_buf_2_, - size_per_head_, - attention_seq_len_2 * size_per_head_, - q_buf_2_, - size_per_head_, - attention_seq_len_1 * size_per_head_, - qk_buf_, - attention_seq_len_2, - attention_seq_len_2 * attention_seq_len_1, - batch_size * head_num_); - - POP_RANGE; - 
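invokeMaskedSoftmax turns the raw Q*K^T scores into attention weights: each score is scaled by qk_scale = 1/sqrt(size_per_head), positions the mask hides receive a large negative bias, and a numerically stable softmax is taken over the key dimension. A scalar host reference for one row of scores, under the convention that mask values of 1.0 mean "visible" (the -10000 bias is a common choice, assumed here rather than taken from the kernel):

```cuda
#include <math.h>

// Reference: masked softmax over one row of qk scores of length k_length.
void masked_softmax_row(float* row, const float* mask, int k_length, float qk_scale)
{
    float max_val = -1e20f;
    for (int j = 0; j < k_length; ++j) {
        row[j] = row[j] * qk_scale + (1.0f - mask[j]) * -10000.0f;
        max_val = fmaxf(max_val, row[j]);
    }
    float sum = 0.f;
    for (int j = 0; j < k_length; ++j) {
        row[j] = expf(row[j] - max_val);
        sum += row[j];
    }
    const float inv_sum = 1.0f / (sum + 1e-6f);
    for (int j = 0; j < k_length; ++j) {
        row[j] *= inv_sum;
    }
}
```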
PUSH_RANGE("softmax"); - MaskedSoftmaxParam param; - param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.qk = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = batch_size; - param.q_length = attention_seq_len_1; - param.k_length = attention_seq_len_2; - param.num_heads = head_num_; - param.qk_scale = qk_scale; - param.linear_bias_slopes = nullptr; - invokeMaskedSoftmax(param, stream_); - sync_check_cuda_error(); - POP_RANGE; - } - - PUSH_RANGE("QK*V batch gemm"); - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, + PUSH_RANGE("Q*K batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, CUBLAS_OP_N, - size_per_head_, - attention_seq_len_1, attention_seq_len_2, - - v_buf_2_, + attention_seq_len_1, + size_per_head_, + k_buf_2_, size_per_head_, attention_seq_len_2 * size_per_head_, - - qk_buf_, - attention_seq_len_2, - attention_seq_len_1 * attention_seq_len_2, - - qkv_buf_2_, + q_buf_2_, size_per_head_, attention_seq_len_1 * size_per_head_, - + qk_buf_, + attention_seq_len_2, + attention_seq_len_2 * attention_seq_len_1, batch_size * head_num_); - sync_check_cuda_error(); - // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) - if (padding_offset == nullptr) { - invokeTransposeQKV(qkv_buf_3_, - qkv_buf_2_, - batch_size, - attention_seq_len_1, - head_num_, - size_per_head_, - attention_weights->attention_output_weight.scale, - 0, // int8_mode - stream_); - sync_check_cuda_error(); - } - else { - invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, - qkv_buf_3_, - num_tokens, - batch_size, - attention_seq_len_1, - head_num_, - size_per_head_, - padding_offset, - attention_weights->attention_output_weight.scale, - 0, // int8_mode - stream_); - sync_check_cuda_error(); - } + POP_RANGE; + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = nullptr; + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); POP_RANGE; } + + PUSH_RANGE("QK*V batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, + CUBLAS_OP_N, + size_per_head_, + attention_seq_len_1, + attention_seq_len_2, + + v_buf_2_, + size_per_head_, + attention_seq_len_2 * size_per_head_, + + qk_buf_, + attention_seq_len_2, + attention_seq_len_1 * attention_seq_len_2, + + qkv_buf_2_, + size_per_head_, + attention_seq_len_1 * size_per_head_, + + batch_size * head_num_); + sync_check_cuda_error(); + + // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) + if (padding_offset == nullptr) { + invokeTransposeQKV(qkv_buf_3_, + qkv_buf_2_, + batch_size, + attention_seq_len_1, + head_num_, + size_per_head_, + attention_weights->attention_output_weight.scale, + 0, // int8_mode + stream_); + sync_check_cuda_error(); + } + else { + invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, + qkv_buf_3_, + num_tokens, + batch_size, + attention_seq_len_1, + head_num_, + size_per_head_, + padding_offset, + attention_weights->attention_output_weight.scale, + 0, // int8_mode + stream_); + sync_check_cuda_error(); + } + POP_RANGE; 
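After the QK*V batched GEMM, invokeTransposeAttentionOutRemovePadding both transposes the attention output from [batch, head_num, seq_len, size_per_head] back to a token-major layout and drops the padding rows. A scalar sketch of that combined gather/transpose, assuming the usual FT convention that padding_offset[t] is the number of padding slots before packed token t:

```cuda
#include <cuda_runtime.h>

// src: [batch, head_num, seq_len, size_per_head]   (padded, head-major)
// dst: [num_tokens, head_num * size_per_head]      (packed, token-major)
__global__ void transpose_remove_padding(float* dst, const float* src, const int* padding_offset,
                                         int head_num, int seq_len, int size_per_head)
{
    const int t      = blockIdx.x;                 // packed token index
    const int padded = t + padding_offset[t];      // position in the padded grid
    const int b      = padded / seq_len;
    const int s      = padded % seq_len;

    for (int i = threadIdx.x; i < head_num * size_per_head; i += blockDim.x) {
        const int h = i / size_per_head;
        const int d = i % size_per_head;
        dst[t * head_num * size_per_head + i] =
            src[((b * head_num + h) * seq_len + s) * size_per_head + d];
    }
}
// launch: transpose_remove_padding<<<num_tokens, 256, 0, stream>>>(...)
```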
sync_check_cuda_error(); PUSH_RANGE("proj gemm"); @@ -387,21 +379,18 @@ void LLaMAContextAttentionLayer::allocateBuffer() } template -void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, - size_t seq_len, - size_t max_seq_len, - bool allocate_qk_buf) +void LLaMAContextAttentionLayer::allocateBuffer( + size_t batch_size, size_t seq_len, size_t attn_len, size_t max_seq_len, bool allocate_qk_buf) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * max_seq_len * 3 * hidden_units_, false); - k_buf_2_ = q_buf_2_ + batch_size * max_seq_len * hidden_units_; - v_buf_2_ = k_buf_2_ + batch_size * max_seq_len * hidden_units_; + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * (seq_len + 2 * attn_len) * hidden_units_, false); + k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; + v_buf_2_ = k_buf_2_ + batch_size * attn_len * hidden_units_; // save memory usage when using fmha if (allocate_qk_buf) { - qk_buf_ = - (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * max_seq_len * max_seq_len, false); + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * attn_len, false); } else { allocator_->free((void**)(&qk_buf_)); @@ -413,7 +402,7 @@ void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, if (is_qk_buf_float_ == true) { if (allocate_qk_buf) { qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * head_num_ * max_seq_len * max_seq_len, false); + qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * attn_len, false); } else { allocator_->free((void**)(&qk_buf_float_)); @@ -431,6 +420,8 @@ void LLaMAContextAttentionLayer::freeBuffer() FT_LOG_DEBUG(__PRETTY_FUNCTION__); allocator_->free((void**)(&qkv_buf_)); allocator_->free((void**)(&q_buf_2_)); + allocator_->free((void**)(&k_buf_2_)); + allocator_->free((void**)(&v_buf_2_)); allocator_->free((void**)(&qk_buf_)); allocator_->free((void**)(&qkv_buf_2_)); allocator_->free((void**)(&qkv_buf_3_)); diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index 7300186ba..4557abf1d 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -38,7 +38,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { std::unique_ptr dispatcher_fp16; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len, bool allocate_qk_buf); + void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, size_t max_seq_len, bool allocate_qk_buf); void freeBuffer() override; using BaseAttentionLayer::is_free_buffer_after_forward_; diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index f0f6d4697..55c507318 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -59,33 +59,29 @@ void LLaMA::allocateBuffer() } template -void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len) +void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, int is_context) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - const size_t self_cache_size = (num_layer_ 
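The reworked allocateBuffer above carves q_buf_2_, k_buf_2_ and v_buf_2_ out of a single allocation of batch_size * (seq_len + 2 * attn_len) * hidden_units_ elements: the query slice only needs seq_len rows per sequence, while the key and value slices hold attn_len rows each once the cache is merged back in. A sketch of that packing; with sub-allocation like this, only the base pointer should ever be handed back to the allocator:

```cuda
#include <cuda_runtime.h>

struct QkvScratch {
    float* q;     // [batch, seq_len,  hidden]
    float* k;     // [batch, attn_len, hidden]
    float* v;     // [batch, attn_len, hidden]
    float* base;  // owning pointer
};

// One cudaMalloc, three views; only `base` is ever freed.
QkvScratch alloc_qkv_scratch(size_t batch, size_t seq_len, size_t attn_len, size_t hidden)
{
    QkvScratch s{};
    const size_t total = batch * (seq_len + 2 * attn_len) * hidden;
    cudaMalloc(reinterpret_cast<void**>(&s.base), total * sizeof(float));
    s.q = s.base;
    s.k = s.q + batch * seq_len * hidden;
    s.v = s.k + batch * attn_len * hidden;
    return s;
}

void free_qkv_scratch(QkvScratch& s)
{
    cudaFree(s.base);  // q/k/v are offsets into the same allocation
    s = QkvScratch{};
}
```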
/ pipeline_para_.world_size_) * batch_size * max_seq_len * hidden_units_; padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); input_attention_mask_ = - (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_seq_len, false)); + (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * attn_len, false)); - key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); - value_cache_ = key_cache_ + self_cache_size; + if (is_context) { + const size_t self_cache_size = + (num_layer_ / pipeline_para_.world_size_) * batch_size * max_seq_len_ * hidden_units_; + key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); + value_cache_ = key_cache_ + self_cache_size; + } context_decoder_input_buf_ = (T*)(allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); context_decoder_output_buf_ = (T*)(allocator_->reMalloc( context_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); -#ifdef USE_NCCL - for (int i = 0; i < num_buffers_; ++i) { - context_decoder_output_buf_clone_[i] = (T*)(allocator_->reMalloc( - context_decoder_output_buf_clone_[i], sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); - } -#endif - context_output_buf_ = (T*)(allocator_->reMalloc(context_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); normed_decoder_output_buf_ = @@ -95,6 +91,13 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ log_likelihood_buf_ = (float*)(allocator_->reMalloc(log_likelihood_buf_, sizeof(float) * batch_size * seq_len * vocab_size_, false)); +#ifdef USE_NCCL + for (int i = 0; i < num_buffers_; ++i) { + context_decoder_output_buf_clone_[i] = (T*)(allocator_->reMalloc( + context_decoder_output_buf_clone_[i], sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); + } +#endif + is_allocate_buffer_ = true; } @@ -109,12 +112,14 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); allocator_->free((void**)(&context_output_buf_)); + allocator_->free((void**)(&normed_decoder_output_buf_)); + allocator_->free((void**)(&logits_buf_)); + allocator_->free((void**)(&log_likelihood_buf_)); #ifdef USE_NCCL for (int i = 0; i < num_buffers_; ++i) { allocator_->free((void**)(&context_decoder_output_buf_clone_[i])); } #endif - allocator_->free((void**)(&logits_buf_)); is_allocate_buffer_ = false; } } @@ -268,7 +273,9 @@ void LLaMA::forward(std::unordered_map* output_ten float* log_probs = output_tensors->at("log_probs").getPtr(); float* cum_probs = output_tensors->at("cum_probs").getPtr(); - allocateBuffer(batch_size, seq_len, max_seq_len_); + FT_CHECK_WITH_INFO(seq_len <= attn_len, "seq_len must be larger than or equal to attn_len"); + + allocateBuffer(batch_size, seq_len, attn_len, is_context); sync_check_cuda_error(); if (is_unpadded_mha) { diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 592861008..ee9442158 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -64,26 +64,23 @@ class LLaMA: public BaseLayer { LLaMAContextDecoder* llama_context_decoder_; void allocateBuffer() override; - void 
allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, int is_context); void freeBuffer() override; void initialize(); protected: - int* padding_offset_ = nullptr; - int* cu_seqlens_ = nullptr; - T* input_attention_mask_ = nullptr; - T* key_cache_ = nullptr; - T* value_cache_ = nullptr; - - T* decoder_output_buf_ = nullptr; - T* context_output_buf_ = nullptr; - T* normed_decoder_output_buf_ = nullptr; - float* logits_buf_ = nullptr; - float* log_likelihood_buf_ = nullptr; - - T* context_decoder_input_buf_ = nullptr; - T* context_decoder_output_buf_ = nullptr; + int* padding_offset_ = nullptr; + int* cu_seqlens_ = nullptr; + T* input_attention_mask_ = nullptr; + T* key_cache_ = nullptr; + T* value_cache_ = nullptr; + T* context_output_buf_ = nullptr; + T* normed_decoder_output_buf_ = nullptr; + float* logits_buf_ = nullptr; + float* log_likelihood_buf_ = nullptr; + T* context_decoder_input_buf_ = nullptr; + T* context_decoder_output_buf_ = nullptr; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 5c90f303e..2709b2164 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -60,7 +60,7 @@ void LLaMAContextDecoder::allocateBuffer() } template -void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len) +void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) { decoder_normed_input_ = reinterpret_cast( @@ -175,8 +175,7 @@ void LLaMAContextDecoder::forward(std::vector* {"attention_mask", input_tensors->at(1)}, {"input_lengths", input_tensors->at(2)}, {"context_lengths", input_tensors->at(3)}, - {"seq_len", input_tensors->at(4)}, - {"attn_len", input_tensors->at(5)}}; + {"seq_len", input_tensors->at(4)}}; std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, {"key_cache", output_tensors->at(1)}, {"value_cache", output_tensors->at(2)}}; @@ -195,22 +194,20 @@ void LLaMAContextDecoder::forward(std::unordered_map* // input_lengths [batch_size] // context_lengths [batch_size] // seq_len [1] int on cpu - // attn_len [1] int on cpu // padding_offset [batch_size] int on cpu // cu_seqlens [batch_size+1] int on cpu // output tensors: // decoder_output [num_tokens, hidden_dimension], // key_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] - // value_cache [num_layer, batch, local_head_num, mxa_seq_len, size_per_head] + // value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] - FT_CHECK(input_tensors->size() >= 7); + FT_CHECK(input_tensors->size() >= 5); FT_CHECK(output_tensors->size() == 3); const DataType data_type = getTensorType(); const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); const size_t batch_size = input_tensors->at("input_lengths").shape[0]; const size_t num_tokens = input_tensors->at("decoder_input").shape[0]; - const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; const int* input_lengths = input_tensors->at("input_lengths").getPtr(); const int* context_lengths = input_tensors->at("context_lengths").getPtr(); @@ -222,7 +219,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* padding_offset = input_tensors->at("padding_offset").getPtr(); cu_seqlens = 
input_tensors->at("cu_seqlens").getPtr(); } - allocateBuffer(batch_size, seq_len, max_seq_len); + allocateBuffer(batch_size, seq_len); sync_check_cuda_error(); T* decoder_input = input_tensors->at("decoder_input").getPtr(); @@ -250,14 +247,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* const bool is_final = false; T* layer_input = decoder_layer_output_; T* layer_output = decoder_layer_output_; - // if (!is_unpadded_mha) { if (isFirstLayerParallelId(l)) { layer_input = decoder_input; } if (isLastLayerParallelId(l)) { layer_output = decoder_output; } - // } invokeGeneralLLaMALayerNorm(decoder_normed_input_, layer_input, @@ -275,11 +270,8 @@ void LLaMAContextDecoder::forward(std::unordered_map* data_type, {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(attn_len)}, attention_mask}}, - {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, - {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, - {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, input_lengths}}, {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, context_lengths}}, - {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &attn_len}}, + {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, }; if (is_unpadded_mha) { diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index eb4e64ef0..94eb82a37 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -56,7 +56,7 @@ class LLaMAContextDecoder: public BaseLayer { FfnLayer* ffn_layer_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len); void freeBuffer() override; bool isValidLayerParallelId(uint l); @@ -67,10 +67,9 @@ class LLaMAContextDecoder: public BaseLayer { void initialize(); protected: - T* decoder_normed_input_ = nullptr; - T* self_attn_output_ = nullptr; - T* decoder_layer_output_ = nullptr; - size_t* h_pinned_token_num_ptr_ = nullptr; + T* decoder_normed_input_ = nullptr; + T* self_attn_output_ = nullptr; + T* decoder_layer_output_ = nullptr; public: LLaMAContextDecoder(size_t head_num, From 5515d83db320d01158466393066189975e813163 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 1 Oct 2023 05:38:47 +0000 Subject: [PATCH 52/55] remove mpi requirement --- examples/cpp/CMakeLists.txt | 2 +- .../LLaMAContextAttentionLayer.cc | 38 ----- .../LLaMAContextAttentionLayer.h | 21 +-- .../models/llama/CMakeLists.txt | 3 - src/fastertransformer/models/llama/LLaMA.cc | 143 ++---------------- src/fastertransformer/models/llama/LLaMA.h | 66 ++------ .../models/llama/LLaMAContextDecoder.cc | 29 ++-- .../models/llama/LLaMAContextDecoder.h | 11 +- src/fastertransformer/th_op/llama/LLaMA.cc | 12 +- src/fastertransformer/th_op/llama/LLaMA.h | 28 ++-- 10 files changed, 66 insertions(+), 287 deletions(-) diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 38ae86412..800dfdd7f 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -26,8 +26,8 @@ add_subdirectory(wenet) add_subdirectory(gptj) add_subdirectory(gptneox) -add_subdirectory(llama) add_subdirectory(multi_gpu_gpt) +#add_subdirectory(llama) if(ENABLE_FP8) add_subdirectory(gpt_fp8) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc 
b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 10d7c7673..daf3d9178 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -293,43 +293,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, - size_t size_per_head, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float): - BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), - head_num_(head_num), - size_per_head_(size_per_head), - hidden_units_(head_num * size_per_head), - rotary_embedding_dim_(0), - is_qk_buf_float_(is_qk_buf_float) -{ -} - -template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, - size_t size_per_head, - size_t local_head_num, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float): - BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), - head_num_(head_num), - size_per_head_(size_per_head), - hidden_units_(head_num * size_per_head), - rotary_embedding_dim_(0), - is_qk_buf_float_(is_qk_buf_float) -{ - FT_LOG_DEBUG(__PRETTY_FUNCTION__); - dispatcher_fp16.reset(new FusedMHARunnerFP16v2(head_num_, size_per_head_, sm_, 1.0f)); -} - template LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, @@ -348,7 +311,6 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_ is_qk_buf_float_(is_qk_buf_float) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - dispatcher_fp16.reset(new FusedMHARunnerFP16v2(head_num_, size_per_head_, sm_, 1.0f)); } template diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index 4557abf1d..8d24689a8 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -34,9 +34,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { const size_t rotary_embedding_dim_; // fmha runner - int sm_ = getSMVersion(); - std::unique_ptr dispatcher_fp16; - + int sm_ = getSMVersion(); void allocateBuffer() override; void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, size_t max_seq_len, bool allocate_qk_buf); void freeBuffer() override; @@ -60,23 +58,6 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { T* qkv_buf_3_ = nullptr; public: - LLaMAContextAttentionLayer(size_t head_num, - size_t size_per_head, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float); - - LLaMAContextAttentionLayer(size_t head_num, - size_t size_per_head, - size_t local_head_num, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float); - LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, size_t local_head_num, diff --git a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt index 24acf1d78..287a350da 100644 --- 
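With the NCCL plumbing removed, the model now keeps only rank_ and world_size_ and sizes its per-rank KV cache as num_layer_ / world_size_ layers. A hedged sketch of the contiguous layer partition such rank bookkeeping implies (FT's isValidLayerParallelId and friends follow this pattern; the helpers here are illustrative and assume num_layer is divisible by world_size):

```cuda
#include <cassert>

// Contiguous pipeline partition: rank r owns layers [r * L/W, (r + 1) * L/W).
struct LayerRange {
    int first;
    int last;  // exclusive
};

LayerRange layers_for_rank(int num_layer, int world_size, int rank)
{
    assert(num_layer % world_size == 0);
    const int per_rank = num_layer / world_size;
    return LayerRange{rank * per_rank, (rank + 1) * per_rank};
}

bool owns_layer(const LayerRange& r, int layer)  // isValidLayerParallelId analogue
{
    return layer >= r.first && layer < r.last;
}
```

Rank 0 then performs the embedding lookup and the last rank produces the logits, which is the role the rank_ == 0 and rank_ == world_size_ - 1 branches in LLaMA::forward play.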
a/src/fastertransformer/models/llama/CMakeLists.txt +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -44,10 +44,7 @@ set_property(TARGET LLaMA PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(LLaMA PUBLIC -lcudart LLaMAContextDecoder decoding_kernels - gpt_kernels llama_kernels - BaseBeamSearchLayer - bert_preprocess_kernels tensor LLaMAWeight cuda_utils diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 55c507318..3884a79ac 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -15,10 +15,8 @@ */ #include "src/fastertransformer/models/llama/LLaMA.h" -#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/decoding_kernels.h" #include "src/fastertransformer/kernels/llama_kernels.h" -#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" #include "src/fastertransformer/utils/llama_utils.h" #include "src/fastertransformer/utils/memory_utils.h" #include @@ -29,21 +27,14 @@ namespace fastertransformer { template void LLaMA::initialize() { -#ifdef USE_NCCL - check_cuda_error(cudaStreamCreateWithFlags(&comm_stream_, cudaStreamNonBlocking)); - for (int i = 0; i < num_buffers_; ++i) { - check_cuda_error(cudaEventCreate(&kern_event_[i])); - check_cuda_error(cudaEventCreate(&comm_event_[i])); - } -#endif - llama_context_decoder_ = new LLaMAContextDecoder(head_num_, size_per_head_, inter_size_, num_layer_, rotary_embedding_dim_, layernorm_eps_, - pipeline_para_, + rank_, + world_size_, stream_, cublas_wrapper_, allocator_, @@ -72,7 +63,7 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len if (is_context) { const size_t self_cache_size = - (num_layer_ / pipeline_para_.world_size_) * batch_size * max_seq_len_ * hidden_units_; + (num_layer_ / world_size_) * batch_size * max_seq_len_ * hidden_units_; key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); value_cache_ = key_cache_ + self_cache_size; } @@ -91,13 +82,6 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len log_likelihood_buf_ = (float*)(allocator_->reMalloc(log_likelihood_buf_, sizeof(float) * batch_size * seq_len * vocab_size_, false)); -#ifdef USE_NCCL - for (int i = 0; i < num_buffers_; ++i) { - context_decoder_output_buf_clone_[i] = (T*)(allocator_->reMalloc( - context_decoder_output_buf_clone_[i], sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); - } -#endif - is_allocate_buffer_ = true; } @@ -115,11 +99,6 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&normed_decoder_output_buf_)); allocator_->free((void**)(&logits_buf_)); allocator_->free((void**)(&log_likelihood_buf_)); -#ifdef USE_NCCL - for (int i = 0; i < num_buffers_; ++i) { - allocator_->free((void**)(&context_decoder_output_buf_clone_[i])); - } -#endif is_allocate_buffer_ = false; } } @@ -133,6 +112,8 @@ LLaMA::LLaMA(size_t head_num, size_t rotary_embedding_dim, size_t random_seed, size_t max_seq_len, + size_t rank, + size_t world_size, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, @@ -149,41 +130,8 @@ LLaMA::LLaMA(size_t head_num, random_seed_(random_seed), max_seq_len_(max_seq_len), hidden_units_(head_num * size_per_head), - attention_type_(attention_type) -{ - pipeline_para_.world_size_ = 1; - pipeline_para_.rank_ = 0; - initialize(); -} - -template -LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t 
inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - size_t random_seed, - size_t max_seq_len, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type): - BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), - head_num_(head_num), - size_per_head_(size_per_head), - inter_size_(inter_size), - num_layer_(num_layer), - vocab_size_(vocab_size), - rotary_embedding_dim_(rotary_embedding_dim), - random_seed_(random_seed), - max_seq_len_(max_seq_len), - hidden_units_(head_num * size_per_head), - pipeline_para_(pipeline_para), + rank_(rank), + world_size_(world_size), attention_type_(attention_type) { initialize(); @@ -201,7 +149,8 @@ LLaMA::LLaMA(LLaMA const& llama): random_seed_(llama.random_seed_), max_seq_len_(llama.max_seq_len_), hidden_units_(llama.hidden_units_), - pipeline_para_(llama.pipeline_para_), + rank_(llama.rank_), + world_size_(llama.world_size_), attention_type_(llama.attention_type_) { initialize(); @@ -210,14 +159,6 @@ LLaMA::LLaMA(LLaMA const& llama): template LLaMA::~LLaMA() { -#ifdef USE_NCCL - check_cuda_error(cudaStreamDestroy(comm_stream_)); - for (int i = 0; i < num_buffers_; ++i) { - check_cuda_error(cudaEventDestroy(kern_event_[i])); - check_cuda_error(cudaEventDestroy(comm_event_[i])); - } -#endif - delete llama_context_decoder_; freeBuffer(); } @@ -288,7 +229,7 @@ void LLaMA::forward(std::unordered_map* output_ten input_attention_mask_, input_lengths, context_lengths, batch_size, seq_len, attn_len, stream_); sync_check_cuda_error(); - if (pipeline_para_.rank_ == 0) { + if (rank_ == 0) { invokeLLaMAInputIdsEmbeddingLookup(context_decoder_input_buf_, llama_weights->pre_decoder_embedding_table, input_ids, @@ -297,24 +238,13 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); } - else { -#ifdef USE_NCCL - ftNcclRecv( - context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - sync_check_cuda_error(); -#endif - } std::unordered_map decoder_input_tensors{ {"decoder_input", Tensor{MEMORY_GPU, data_type, {num_tokens, hidden_units_}, -#ifdef USE_NCCL - context_decoder_input_buf_ -#else - pipeline_para_.rank_ == 0 ? context_decoder_input_buf_ : hidden_vector -#endif + rank_ == 0 ? context_decoder_input_buf_ : hidden_vector }}, {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, @@ -333,55 +263,24 @@ void LLaMA::forward(std::unordered_map* output_ten Tensor{MEMORY_GPU, data_type, {num_tokens, hidden_units_}, -#ifdef USE_NCCL - context_decoder_output_buf_ -#else - (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) ? context_decoder_output_buf_ : hidden_vector -#endif + (rank_ == world_size_ - 1) ? 
context_decoder_output_buf_ : hidden_vector }}, {"key_cache", Tensor{MEMORY_GPU, data_type, - {num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, + {num_layer_ / world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, key_cache_}}, {"value_cache", Tensor{MEMORY_GPU, data_type, - {num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, + {num_layer_ / world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, value_cache_}}}; llama_context_decoder_->forward( &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { -#ifdef USE_NCCL - // buf_no_ = (buf_no_ + 1) % num_buffers_; - // check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); - // invokeLLaMACopyKernel(context_decoder_output_buf_clone_[buf_no_], - // context_decoder_output_buf_, - // num_tokens * hidden_units_, - // stream_); - // sync_check_cuda_error(); - // check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); - // check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); - // ftNcclGroupStart(); - // ftNcclSend(context_decoder_output_buf_clone_[buf_no_], - // num_tokens * hidden_units_, - // pipeline_para_.rank_ + 1, - // pipeline_para_, - // comm_stream_); - // ftNcclGroupEnd(); - // check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); - // sync_check_cuda_error(); - - ftNcclGroupStart(); - ftNcclSend( - context_decoder_output_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, stream_); - ftNcclGroupEnd(); -#endif - } - else if (is_context) { + if (is_context) { invokeLLaMAGetLastTokens( context_output_buf_, context_decoder_output_buf_, cu_seqlens_, batch_size, hidden_units_, stream_); sync_check_cuda_error(); @@ -455,18 +354,6 @@ void LLaMA::forward(std::unordered_map* output_ten } } -template -size_t LLaMA::getPipelineParallelRank() -{ - return pipeline_para_.rank_; -} - -template -size_t LLaMA::getPipelineParallelSize() -{ - return pipeline_para_.world_size_; -} - template class LLaMA; template class LLaMA; diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index ee9442158..7be6120ab 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -23,42 +23,26 @@ #include "src/fastertransformer/models/llama/LLaMAWeight.h" #include "src/fastertransformer/utils/custom_ar_comm.h" -//#define USE_NCCL - namespace fastertransformer { template class LLaMA: public BaseLayer { private: // meta data - size_t head_num_; - size_t size_per_head_; - size_t inter_size_; - size_t num_layer_; - size_t vocab_size_; - size_t rotary_embedding_dim_; - size_t random_seed_; - size_t max_seq_len_; - -#ifdef USE_NCCL - static constexpr int num_buffers_ = 5; - int buf_no_ = 0; - cudaStream_t comm_stream_; - cudaEvent_t kern_event_[num_buffers_]; - cudaEvent_t comm_event_[num_buffers_]; - T* context_decoder_output_buf_clone_[num_buffers_] = {nullptr}; -#endif - + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t vocab_size_; + size_t rotary_embedding_dim_; + size_t random_seed_; + size_t max_seq_len_; + size_t hidden_units_; + size_t rank_; + size_t world_size_; static constexpr float layernorm_eps_ = 1e-6f; - - size_t hidden_units_; - - NcclParam tensor_para_; - NcclParam pipeline_para_; - - AttentionType 
attention_type_; - - const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr + AttentionType attention_type_; + const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); LLaMAContextDecoder* llama_context_decoder_; @@ -94,23 +78,8 @@ class LLaMA: public BaseLayer { size_t rotary_embedding_dim, size_t random_seed, size_t max_seq_len, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA); - - LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - size_t random_seed, - size_t max_seq_len, - NcclParam tensor_para, - NcclParam pipeline_para, + size_t rank, + size_t world_size, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, @@ -130,11 +99,6 @@ class LLaMA: public BaseLayer { const std::unordered_map* input_tensors, const LLaMAWeight* llama_weights); - size_t getPipelineParallelRank(); - size_t getPipelineParallelSize(); - size_t getTensorParallelRank(); - size_t getTensorParallelSize(); - bool* getFinishBuffer(); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 2709b2164..1d5901c61 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -15,10 +15,7 @@ */ #include "src/fastertransformer/models/llama/LLaMAContextDecoder.h" -#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" -#include "src/fastertransformer/kernels/gpt_kernels.h" #include "src/fastertransformer/kernels/llama_kernels.h" - #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" #include "src/fastertransformer/utils/llama_utils.h" @@ -86,30 +83,29 @@ void LLaMAContextDecoder::freeBuffer() template bool LLaMAContextDecoder::isValidLayerParallelId(uint l) { - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) - && (l < local_num_layer * (pipeline_para_.rank_ + 1)); + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_)); + return l < num_layer_ && (l >= local_num_layer * rank_) && (l < local_num_layer * (rank_ + 1)); } template bool LLaMAContextDecoder::isFirstLayerParallelId(uint l) { - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_)); + return l < num_layer_ && (l == local_num_layer * rank_); } template bool LLaMAContextDecoder::isLastLayerParallelId(uint l) { - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return l < num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_)); + return l < num_layer_ && (l == local_num_layer * (rank_ + 1) - 1); } template int LLaMAContextDecoder::getFirstLayerParallelId() { - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / 
pipeline_para_.world_size_)); - return local_num_layer * pipeline_para_.rank_; + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_)); + return local_num_layer * rank_; } template @@ -119,7 +115,8 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, size_t num_layer, size_t rotary_embedding_dim, float layernorm_eps, - NcclParam pipeline_para, + size_t rank, + size_t world_size, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, @@ -134,7 +131,8 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, rotary_embedding_dim_(rotary_embedding_dim), layernorm_eps_(layernorm_eps), hidden_units_(head_num * size_per_head), - pipeline_para_(pipeline_para), + rank_(rank), + world_size_(world_size), is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type) { @@ -151,7 +149,8 @@ LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decode rotary_embedding_dim_(decoder.rotary_embedding_dim_), layernorm_eps_(decoder.layernorm_eps_), hidden_units_(decoder.hidden_units_), - pipeline_para_(decoder.pipeline_para_), + rank_(decoder.rank_), + world_size_(decoder.world_size_), is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_) { diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index 94eb82a37..94d960dab 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -42,14 +42,10 @@ class LLaMAContextDecoder: public BaseLayer { size_t num_layer_; size_t rotary_embedding_dim_; float layernorm_eps_; - - // calculated data size_t hidden_units_; - - NcclParam pipeline_para_; - + size_t rank_; + size_t world_size_; AttentionType attention_type_; - bool is_qk_buf_float_; BaseAttentionLayer* self_attention_layer_; @@ -78,7 +74,8 @@ class LLaMAContextDecoder: public BaseLayer { size_t num_layer, size_t rotary_embedding_dim, float layernorm_eps, - NcclParam pipeline_para, + size_t rank, + size_t world_size, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index 580b05d5c..ef9cca14f 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -27,8 +27,8 @@ LLaMA::LLaMA(const int64_t num_heads, const int64_t rotary_embedding_dim, const int64_t random_seed, const int64_t max_seq_len, - const int64_t tensor_para_size, - const int64_t pipeline_para_size, + const int64_t rank, + const int64_t world_size, const vector weights): vocab_size_(vocab_size), st_(weights[0].scalar_type()) { @@ -46,8 +46,8 @@ LLaMA::LLaMA(const int64_t num_heads, (size_t)rotary_embedding_dim, (size_t)random_seed, (size_t)max_seq_len, - tensor_para_size, - pipeline_para_size, + (size_t)rank, + (size_t)world_size, weights); break; case at::ScalarType::Half: @@ -59,8 +59,8 @@ LLaMA::LLaMA(const int64_t num_heads, (size_t)rotary_embedding_dim, (size_t)random_seed, (size_t)max_seq_len, - tensor_para_size, - pipeline_para_size, + (size_t)rank, + (size_t)world_size, weights); break; default: diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 029780e7f..db8b8aeec 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -52,8 +52,8 @@ class FTLLaMA: public IFLLaMA { const size_t rotary_embedding_dim, const size_t random_seed, 
const size_t max_seq_len, - const int64_t tensor_para_size, - const int64_t pipeline_para_size, + const int64_t rank, + const int64_t world_size, const vector weights): num_heads_(num_heads), size_per_head_(size_per_head), @@ -63,8 +63,8 @@ class FTLLaMA: public IFLLaMA { rotary_embedding_dim_(rotary_embedding_dim), random_seed_(random_seed), max_seq_len_(max_seq_len), - tensor_para_size_(tensor_para_size), - pipeline_para_size_(pipeline_para_size), + rank_(rank), + world_size_(world_size), weights_(weights) { ft::Logger::getLogger().setLevel(ft::Logger::WARNING); @@ -73,8 +73,6 @@ class FTLLaMA: public IFLLaMA { cublas_algo_map_ = new ft::cublasAlgoMap(GEMM_CONFIG, ""); cublas_wrapper_mutex_ = new std::mutex(); - ftNcclInitialize(tensor_para_, pipeline_para_, tensor_para_size, pipeline_para_size); - llama_weights_.resizeLayer(num_layers_); for (int i = 0; i < (int)num_layers_; i++) { llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta = @@ -113,7 +111,6 @@ class FTLLaMA: public IFLLaMA { llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * num_layers_ + 3]); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); - // ft::check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); ft::check_cuda_error(cudaStreamCreate(&stream_)); for (int i = 0; i < num_events_; ++i) { @@ -150,8 +147,8 @@ class FTLLaMA: public IFLLaMA { rotary_embedding_dim_, random_seed_, max_seq_len_, - tensor_para_, - pipeline_para_, + rank_, + world_size_, stream_, cublas_wrapper_, allocator_, @@ -172,8 +169,6 @@ class FTLLaMA: public IFLLaMA { delete cublas_wrapper_; delete allocator_; - ft::ftNcclParamDestroy(tensor_para_); - ft::ftNcclParamDestroy(pipeline_para_); cublasLtDestroy(cublasltHandle_); delete cublas_algo_map_; delete cublas_wrapper_mutex_; @@ -256,8 +251,8 @@ class FTLLaMA: public IFLLaMA { const size_t rotary_embedding_dim_; const size_t random_seed_; const size_t max_seq_len_; - int64_t tensor_para_size_; - int64_t pipeline_para_size_; + const size_t rank_; + const size_t world_size_; static constexpr int num_events_ = 5; int ev_no_ = 0; @@ -271,9 +266,6 @@ class FTLLaMA: public IFLLaMA { struct cudaDeviceProp prop_; ft::LLaMAWeight llama_weights_; - ft::NcclParam tensor_para_; - ft::NcclParam pipeline_para_; - ft::cublasMMWrapper* cublas_wrapper_; ft::IAllocator* allocator_; ft::LLaMA* llama_ = nullptr; @@ -289,8 +281,8 @@ class LLaMA: public th::jit::CustomClassHolder { const int64_t rotary_embedding_dim, const int64_t random_seed, const int64_t max_seq_len, - const int64_t tensor_para_size, - const int64_t pipeline_para_size, + const int64_t rank, + const int64_t world_size, const vector weights); ~LLaMA(); From ce8c72a3040c2ec075cb34d98357c640ab1ef00e Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 1 Oct 2023 07:04:48 +0000 Subject: [PATCH 53/55] add mpi_cxx --- CMakeLists.txt | 2 +- src/fastertransformer/utils/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 870e67f0a..0d879611a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -418,7 +418,7 @@ add_library(transformer-shared SHARED if (BUILD_MULTI_GPU) target_link_libraries(transformer-shared PUBLIC - -lmpi + -lmpi -lmpi_cxx ${NCCL_LIBRARIES} ) endif() diff --git a/src/fastertransformer/utils/CMakeLists.txt b/src/fastertransformer/utils/CMakeLists.txt index 9796ad076..22f735c27 100644 --- a/src/fastertransformer/utils/CMakeLists.txt +++ b/src/fastertransformer/utils/CMakeLists.txt @@ -57,7 +57,7 @@ 
add_library(mpi_utils STATIC mpi_utils.cc) set_property(TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) if (BUILD_MULTI_GPU) - target_link_libraries(mpi_utils PUBLIC -lmpi logger) + target_link_libraries(mpi_utils PUBLIC -lmpi -lmpi_cxx logger) endif() add_library(nccl_utils STATIC nccl_utils.cc) From 48b35f76924cf89a7ec61d26462f0444f8a2f1f2 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 1 Oct 2023 10:25:54 +0000 Subject: [PATCH 54/55] ref --- .../LLaMAContextAttentionLayer.cc | 159 ++++++------------ .../LLaMAContextAttentionLayer.h | 8 +- src/fastertransformer/models/llama/LLaMA.cc | 11 +- src/fastertransformer/models/llama/LLaMA.h | 2 - .../models/llama/LLaMAContextDecoder.cc | 6 +- .../models/llama/LLaMAContextDecoder.h | 2 - src/fastertransformer/th_op/llama/LLaMA.cc | 4 +- src/fastertransformer/th_op/llama/LLaMA.h | 8 - 8 files changed, 60 insertions(+), 140 deletions(-) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index daf3d9178..209fed1ca 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -64,7 +64,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten "LLaMA Context FUSED_PADDED_MHA is not supported !"); PUSH_RANGE("attention buffer alloc"); - allocateBuffer(batch_size, seq_len, attn_len, max_seq_len, attention_type != AttentionType::FUSED_MHA); + allocateBuffer(batch_size, seq_len, attn_len); POP_RANGE; sync_check_cuda_error(); @@ -138,86 +138,51 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten const int attention_seq_len_1 = seq_len; // q length const int attention_seq_len_2 = attn_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); + FT_CHECK(gemm_data_type != CUDA_R_32F); // // softmax(Q*K^T) // - if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { - PUSH_RANGE("Q*K batch gemm"); - - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, - CUBLAS_OP_N, - attention_seq_len_2, // n - attention_seq_len_1, // m - size_per_head_, // k - 1.0f, - k_buf_2_, - gemm_data_type, - size_per_head_, // k - attention_seq_len_2 * size_per_head_, // n * k - q_buf_2_, - gemm_data_type, - size_per_head_, // k - attention_seq_len_1 * size_per_head_, // m * k - 0.0f, - qk_buf_float_, - CUDA_R_32F, - attention_seq_len_2, // n - attention_seq_len_2 * attention_seq_len_1, - batch_size * head_num_, // global batch size - CUDA_R_32F); - sync_check_cuda_error(); - POP_RANGE; - - PUSH_RANGE("softmax"); - MaskedSoftmaxParam param; - param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) - param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = batch_size; - param.q_length = attention_seq_len_1; - param.k_length = attention_seq_len_2; - param.num_heads = head_num_; - param.qk_scale = qk_scale; - param.linear_bias_slopes = nullptr; - invokeMaskedSoftmax(param, stream_); - sync_check_cuda_error(); - POP_RANGE; - } - else { - PUSH_RANGE("Q*K batch gemm"); - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, - CUBLAS_OP_N, - attention_seq_len_2, - attention_seq_len_1, - size_per_head_, - k_buf_2_, - size_per_head_, - attention_seq_len_2 * size_per_head_, - q_buf_2_, - size_per_head_, - 
attention_seq_len_1 * size_per_head_, - qk_buf_, - attention_seq_len_2, - attention_seq_len_2 * attention_seq_len_1, - batch_size * head_num_); - - POP_RANGE; - PUSH_RANGE("softmax"); - MaskedSoftmaxParam param; - param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.qk = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = batch_size; - param.q_length = attention_seq_len_1; - param.k_length = attention_seq_len_2; - param.num_heads = head_num_; - param.qk_scale = qk_scale; - param.linear_bias_slopes = nullptr; - invokeMaskedSoftmax(param, stream_); - sync_check_cuda_error(); - POP_RANGE; - } + PUSH_RANGE("Q*K batch gemm"); + + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, + CUBLAS_OP_N, + attention_seq_len_2, // n + attention_seq_len_1, // m + size_per_head_, // k + 1.0f, + k_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_2 * size_per_head_, // n * k + q_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_1 * size_per_head_, // m * k + 0.0f, + qk_buf_float_, + CUDA_R_32F, + attention_seq_len_2, // n + attention_seq_len_2 * attention_seq_len_1, + batch_size * head_num_, // global batch size + CUDA_R_32F); + sync_check_cuda_error(); + POP_RANGE; + + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = nullptr; + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; PUSH_RANGE("QK*V batch gemm"); cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, @@ -301,14 +266,12 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_ cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float): + bool is_free_buffer_after_forward): BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), - rotary_embedding_dim_(rotary_embedding_dim), - is_qk_buf_float_(is_qk_buf_float) + rotary_embedding_dim_(rotary_embedding_dim) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); } @@ -322,8 +285,7 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(LLaMAContextAttentionL head_num_(attention_layer.head_num_), size_per_head_(attention_layer.size_per_head_), hidden_units_(attention_layer.hidden_units_), - rotary_embedding_dim_(attention_layer.rotary_embedding_dim_), - is_qk_buf_float_(attention_layer.is_qk_buf_float_) + rotary_embedding_dim_(attention_layer.rotary_embedding_dim_) { } @@ -341,36 +303,22 @@ void LLaMAContextAttentionLayer::allocateBuffer() } template -void LLaMAContextAttentionLayer::allocateBuffer( - size_t batch_size, size_t seq_len, size_t attn_len, size_t max_seq_len, bool allocate_qk_buf) +void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, 
sizeof(T) * batch_size * (seq_len + 2 * attn_len) * hidden_units_, false); + q_buf_2_ = + (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * (seq_len + 2 * attn_len) * hidden_units_, false); k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; v_buf_2_ = k_buf_2_ + batch_size * attn_len * hidden_units_; // save memory usage when using fmha - if (allocate_qk_buf) { - qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * attn_len, false); - } - else { - allocator_->free((void**)(&qk_buf_)); - qk_buf_ = nullptr; - } + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * attn_len, false); qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, false); qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, false); - if (is_qk_buf_float_ == true) { - if (allocate_qk_buf) { - qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * attn_len, false); - } - else { - allocator_->free((void**)(&qk_buf_float_)); - qk_buf_float_ = nullptr; - } - } + qk_buf_float_ = + (float*)allocator_->reMalloc(qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * attn_len, false); is_allocate_buffer_ = true; } @@ -387,10 +335,7 @@ void LLaMAContextAttentionLayer::freeBuffer() allocator_->free((void**)(&qk_buf_)); allocator_->free((void**)(&qkv_buf_2_)); allocator_->free((void**)(&qkv_buf_3_)); - - if (is_qk_buf_float_ == true) { - allocator_->free((void**)(&qk_buf_float_)); - } + allocator_->free((void**)(&qk_buf_float_)); is_allocate_buffer_ = false; } diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index 8d24689a8..504cc8aba 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -34,17 +34,14 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { const size_t rotary_embedding_dim_; // fmha runner - int sm_ = getSMVersion(); void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, size_t max_seq_len, bool allocate_qk_buf); + void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len); void freeBuffer() override; using BaseAttentionLayer::is_free_buffer_after_forward_; using BaseAttentionLayer::is_allocate_buffer_; using BaseAttentionLayer::cublas_wrapper_; - bool is_qk_buf_float_; - protected: using BaseAttentionLayer::allocator_; using BaseAttentionLayer::stream_; @@ -65,8 +62,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float); + bool is_free_buffer_after_forward); LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer); diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 3884a79ac..052a9b2e2 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -39,7 +39,6 @@ void LLaMA::initialize() cublas_wrapper_, allocator_, is_free_buffer_after_forward_, - is_context_qk_buf_float_, attention_type_); } @@ -188,7 +187,6 @@ void LLaMA::forward(std::unordered_map* output_ten // output_tensors: // 
hidden_vector [num_tokens, hidden_size] - // log_probs [num_tokens, vocab_size] // cum_probs [beam_width, batch_size] FT_CHECK_WITH_INFO(input_tensors->size() == 7, "input_tensors->size() == 7"); @@ -211,7 +209,6 @@ void LLaMA::forward(std::unordered_map* output_ten const int attn_len = input_tensors->at("attn_len").getVal(); const int is_context = input_tensors->at("is_context").getVal(); T* hidden_vector = output_tensors->at("hidden_vector").getPtr(); - float* log_probs = output_tensors->at("log_probs").getPtr(); float* cum_probs = output_tensors->at("cum_probs").getPtr(); FT_CHECK_WITH_INFO(seq_len <= attn_len, "seq_len must be larger than or equal to attn_len"); @@ -311,11 +308,11 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); cublas_wrapper_->setFP16GemmConfig(); - invokeLLaMALogSoftmax(log_probs, logits_buf_, batch_size, vocab_size_, stream_); + invokeLLaMALogSoftmax(log_likelihood_buf_, logits_buf_, batch_size, vocab_size_, stream_); sync_check_cuda_error(); invokeLLaMAExtractTargets( - cum_probs, log_probs, target_ids, cu_seqlens_, beam_width, batch_size, vocab_size_, num_tokens, stream_); + cum_probs, log_likelihood_buf_, target_ids, cu_seqlens_, beam_width, batch_size, vocab_size_, num_tokens, stream_); sync_check_cuda_error(); } else { @@ -345,11 +342,11 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); cublas_wrapper_->setFP16GemmConfig(); - invokeLLaMALogSoftmax(log_probs, logits_buf_, num_tokens, vocab_size_, stream_); + invokeLLaMALogSoftmax(log_likelihood_buf_, logits_buf_, num_tokens, vocab_size_, stream_); sync_check_cuda_error(); invokeLLaMAGatherTokens( - cum_probs, log_probs, input_lengths, target_ids, cu_seqlens_, batch_size, vocab_size_, num_tokens, stream_); + cum_probs, log_likelihood_buf_, input_lengths, target_ids, cu_seqlens_, batch_size, vocab_size_, num_tokens, stream_); sync_check_cuda_error(); } } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 7be6120ab..117f87341 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -42,8 +42,6 @@ class LLaMA: public BaseLayer { size_t world_size_; static constexpr float layernorm_eps_ = 1e-6f; AttentionType attention_type_; - const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr - || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); LLaMAContextDecoder* llama_context_decoder_; diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 1d5901c61..036921135 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -32,8 +32,7 @@ void LLaMAContextDecoder::initialize() stream_, cublas_wrapper_, allocator_, - is_free_buffer_after_forward_, - is_qk_buf_float_); + is_free_buffer_after_forward_); ffn_layer_ = new SiluFfnLayer(0, // max_batch_size 0, // max_seq_len @@ -121,7 +120,6 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), head_num_(head_num), @@ -133,7 +131,6 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, hidden_units_(head_num * size_per_head), rank_(rank), world_size_(world_size), - 
is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type) { initialize(); @@ -151,7 +148,6 @@ LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decode hidden_units_(decoder.hidden_units_), rank_(decoder.rank_), world_size_(decoder.world_size_), - is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_) { initialize(); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index 94d960dab..3e2aeb0c0 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -46,7 +46,6 @@ class LLaMAContextDecoder: public BaseLayer { size_t rank_; size_t world_size_; AttentionType attention_type_; - bool is_qk_buf_float_; BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; @@ -80,7 +79,6 @@ class LLaMAContextDecoder: public BaseLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, AttentionType attention_type = AttentionType::FUSED_MHA); LLaMAContextDecoder(LLaMAContextDecoder const& decoder); diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index ef9cca14f..760ead92e 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -74,7 +74,6 @@ LLaMA::~LLaMA() } std::vector LLaMA::forward(th::Tensor& hidden_vector, - th::Tensor& log_probs, th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, @@ -92,7 +91,6 @@ std::vector LLaMA::forward(th::Tensor& hidden_vector, TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); ftllama->forward(hidden_vector, - log_probs, cum_probs, input_ids, input_lengths, @@ -101,7 +99,7 @@ std::vector LLaMA::forward(th::Tensor& hidden_vector, seq_len, attn_len, is_context); - return std::vector{hidden_vector, log_probs, cum_probs}; + return std::vector{hidden_vector, cum_probs}; } } // namespace torch_ext diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index db8b8aeec..425f260df 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -30,7 +30,6 @@ class IFLLaMA { public: virtual ~IFLLaMA() {} virtual void forward(th::Tensor& hidden_vector, - th::Tensor& log_probs, th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, @@ -175,7 +174,6 @@ class FTLLaMA: public IFLLaMA { } virtual void forward(th::Tensor& hidden_vector, - th::Tensor& log_probs, th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, @@ -212,11 +210,6 @@ class FTLLaMA: public IFLLaMA { (std::is_same::value) ? 
ft::TYPE_FP16 : ft::TYPE_FP32, std::vector{num_tokens, num_heads_ * size_per_head_}, get_ptr(hidden_vector)}}, - {"log_probs", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_FP32, - std::vector{num_tokens, vocab_size_}, - get_ptr(log_probs)}}, {"cum_probs", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, @@ -288,7 +281,6 @@ class LLaMA: public th::jit::CustomClassHolder { ~LLaMA(); std::vector forward(th::Tensor& hidden_vector, - th::Tensor& log_probs, th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, From baca61bafecd25a11f8099bf3cbf778caaaac245 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 2 Oct 2023 03:14:12 +0000 Subject: [PATCH 55/55] final --- .../kernels/llama_kernels.cu | 39 +++++++++++++++++-- src/fastertransformer/kernels/llama_kernels.h | 1 + .../LLaMAContextAttentionLayer.cc | 4 +- src/fastertransformer/models/llama/LLaMA.cc | 4 +- .../models/llama/LLaMAContextDecoder.cc | 4 ++ 5 files changed, 47 insertions(+), 5 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index d6d119227..4b02602d9 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -41,9 +41,19 @@ template void invokeLLaMAGetLastTokens( float* out, float* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); template void invokeLLaMAGetLastTokens( half* out, half* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); - -__global__ void LLaMA_extract_targets( - float* out, float* in, const int* target_ids, const int* cu_seqlens, int beam_width, int batch_size, int vocab_size, int num_tokens) +#ifdef ENABLE_BF16 +template void invokeLLaMAGetLastTokens( + __nv_bfloat16* out, __nv_bfloat16* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); +#endif + +__global__ void LLaMA_extract_targets(float* out, + float* in, + const int* target_ids, + const int* cu_seqlens, + int beam_width, + int batch_size, + int vocab_size, + int num_tokens) { // in [batch_size, vocab_size] // target_ids [ beam_width, num_tokens ] @@ -211,6 +221,14 @@ template void invokeLLaMAInputIdsEmbeddingLookup(half* out, const int num_tokens, const int hidden_units, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAInputIdsEmbeddingLookup(__nv_bfloat16* out, + const __nv_bfloat16* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); +#endif __global__ void LLaMAgetPaddingOffsetAndCuSeqLensKernel( int* padding_offset, int* cu_seqlens, const int* sequence_length, const int batch_size, const int seq_len) @@ -299,6 +317,15 @@ template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, const int seq_len, const int attn_len, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMABuildDecoderAttentionMask(__nv_bfloat16* attention_mask, + const int* sequence_length, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int attn_len, + cudaStream_t stream); +#endif template __global__ void LLaMACopyKernel(T* dst, T* src, const int count) @@ -326,6 +353,9 @@ void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream) template void invokeLLaMACopyKernel(float* dst, float* src, const int count, cudaStream_t stream); template void invokeLLaMACopyKernel(half* dst, half* src, const int count, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMACopyKernel(__nv_bfloat16* 
dst, __nv_bfloat16* src, const int count, cudaStream_t stream); +#endif template __global__ void LLaMAMemset0Kernel(T* dst, const int count) @@ -352,5 +382,8 @@ void invokeLLaMAMemset0(T* dst, const int count, cudaStream_t stream) template void invokeLLaMAMemset0(float* dst, const int count, cudaStream_t stream); template void invokeLLaMAMemset0(half* dst, const int count, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAMemset0(__nv_bfloat16* dst, const int count, cudaStream_t stream); +#endif } // namespace fastertransformer diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h index f0d356a09..01e3bbf7a 100644 --- a/src/fastertransformer/kernels/llama_kernels.h +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -29,6 +29,7 @@ void invokeLLaMAInputIdsEmbeddingLookup(T* from_tensor, template void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream); + template void invokeLLaMAMemset0(T* dst, const int count, cudaStream_t stream); diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 209fed1ca..28c0f6f55 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -18,6 +18,7 @@ #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" #include "src/fastertransformer/kernels/layernorm_kernels.h" #include "src/fastertransformer/kernels/unfused_attention_kernels.h" +#include "src/fastertransformer/kernels/llama_kernels.h" #include "src/fastertransformer/utils/llama_utils.h" #include "src/fastertransformer/utils/nvtx_utils.h" @@ -87,7 +88,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync(q_buf_2_, 0, batch_size * (seq_len + 2 * attn_len) * hidden_units_ * sizeof(T), stream_); + //cudaMemsetAsync(q_buf_2_, 0, batch_size * (seq_len + 2 * attn_len) * hidden_units_ * sizeof(T), stream_); + invokeLLaMAMemset0(q_buf_2_, batch_size * (seq_len + 2 * attn_len) * hidden_units_, stream_); sync_check_cuda_error(); } diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 052a9b2e2..1cc8a95c4 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -248,7 +248,9 @@ void LLaMA::forward(std::unordered_map* output_ten {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, context_lengths}}, {"seq_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &seq_len}}, - {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &attn_len}}}; + {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &attn_len}}, + {"is_context", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &is_context}}, + }; if (is_unpadded_mha) { decoder_input_tensors.insert({"padding_offset", Tensor{MEMORY_GPU, TYPE_INT32, {num_tokens}, padding_offset_}}); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 036921135..7406c838f 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -189,6 +189,8 @@ void LLaMAContextDecoder::forward(std::unordered_map* // 
input_lengths [batch_size] // context_lengths [batch_size] // seq_len [1] int on cpu + // attn_len [1] int on cpu + // is_context [1] int on cpu // padding_offset [batch_size] int on cpu // cu_seqlens [batch_size+1] int on cpu @@ -208,6 +210,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* const int* context_lengths = input_tensors->at("context_lengths").getPtr(); const int seq_len = input_tensors->at("attention_mask").shape[2]; const int attn_len = input_tensors->at("attention_mask").shape[3]; + const int is_context = input_tensors->at("is_context").getVal(); const int* padding_offset = nullptr; const int* cu_seqlens = nullptr; if (is_unpadded_mha) { @@ -267,6 +270,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* attention_mask}}, {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, context_lengths}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, + {"is_context", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &is_context}}, }; if (is_unpadded_mha) {