From bf93524af782db46ecc5a350f8e8145f98b0a71f Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 11 Sep 2023 08:28:28 +0000 Subject: [PATCH 01/55] fix readme --- FasterTransformerReadME.md | 417 ++++++++++++++++++++++++++++++++++++ README.md | 418 +------------------------------------ 2 files changed, 425 insertions(+), 410 deletions(-) create mode 100644 FasterTransformerReadME.md diff --git a/FasterTransformerReadME.md b/FasterTransformerReadME.md new file mode 100644 index 000000000..a00e0d631 --- /dev/null +++ b/FasterTransformerReadME.md @@ -0,0 +1,417 @@ +# FasterTransformer + +This repository provides a script and recipe to run the highly optimized transformer-based encoder and decoder component, and it is tested and maintained by NVIDIA. + +## Table Of Contents + +- [FasterTransformer](#fastertransformer) + - [Table Of Contents](#table-of-contents) + - [Model overview](#model-overview) + - [Support matrix](#support-matrix) + - [Advanced](#advanced) + - [Global Environment](#global-environment) + - [Performance](#performance) + - [BERT base performance](#bert-base-performance) + - [BERT base performances of FasterTransformer new features](#bert-base-performances-of-fastertransformer-new-features) + - [BERT base performance on TensorFlow](#bert-base-performance-on-tensorflow) + - [BERT base performance on PyTorch](#bert-base-performance-on-pytorch) + - [Decoding and Decoder performance](#decoding-and-decoder-performance) + - [Decoder and Decoding end-to-end translation performance on TensorFlow](#decoder-and-decoding-end-to-end-translation-performance-on-tensorflow) + - [Decoder and Decoding end-to-end translation performance on PyTorch](#decoder-and-decoding-end-to-end-translation-performance-on-pytorch) + - [GPT performance](#gpt-performance) + - [Release notes](#release-notes) + - [Changelog](#changelog) + - [Known issues](#known-issues) + +## Model overview + +In NLP, encoder and decoder are two important components, with the transformer layer becoming a popular architecture for both components. FasterTransformer implements a highly optimized transformer layer for both the encoder and decoder for inference. On Volta, Turing and Ampere GPUs, the computing power of Tensor Cores are used automatically when the precision of the data and weights are FP16. + +FasterTransformer is built on top of CUDA, cuBLAS, cuBLASLt and C++. We provide at least one API of the following frameworks: TensorFlow, PyTorch and Triton backend. Users can integrate FasterTransformer into these frameworks directly. For supporting frameworks, we also provide example codes to demonstrate how to use, and show the performance on these frameworks. 
+ +### Support matrix + +| Models | Framework | FP16 | INT8 (after Turing) | Sparsity (after Ampere) | Tensor parallel | Pipeline parallel | FP8 (after Hopper) | +| ---------------- | -------------- | ---- | ------------------- | ----------------------- | --------------- | ----------------- | ------------------ | +| BERT | TensorFlow | Yes | Yes | - | - | - | - | +| BERT | PyTorch | Yes | Yes | Yes | Yes | Yes | - | +| BERT | Triton backend | Yes | - | - | Yes | Yes | - | +| BERT | C++ | Yes | Yes | - | - | - | Yes | +| XLNet | C++ | Yes | - | - | - | - | - | +| Encoder | TensorFlow | Yes | Yes | - | - | - | - | +| Encoder | PyTorch | Yes | Yes | Yes | - | - | - | +| Decoder | TensorFlow | Yes | - | - | - | - | - | +| Decoder | PyTorch | Yes | - | - | - | - | - | +| Decoding | TensorFlow | Yes | - | - | - | - | - | +| Decoding | PyTorch | Yes | - | - | - | - | - | +| GPT | TensorFlow | Yes | - | - | - | - | - | +| GPT/OPT | PyTorch | Yes | - | - | Yes | Yes | Yes | +| GPT/OPT | Triton backend | Yes | - | - | Yes | Yes | - | +| GPT-MoE | PyTorch | Yes | - | - | Yes | Yes | - | +| BLOOM | PyTorch | Yes | - | - | Yes | Yes | - | +| BLOOM | Triton backend | Yes | - | - | Yes | Yes | - | +| GPT-J | Triton backend | Yes | - | - | Yes | Yes | - | +| Longformer | PyTorch | Yes | - | - | - | - | - | +| T5/UL2 | PyTorch | Yes | - | - | Yes | Yes | - | +| T5 | TensorFlow 2 | Yes | - | - | - | - | - | +| T5/UL2 | Triton backend | Yes | - | - | Yes | Yes | - | +| T5 | TensorRT | Yes | - | - | Yes | Yes | - | +| T5-MoE | PyTorch | Yes | - | - | Yes | Yes | - | +| Swin Transformer | PyTorch | Yes | Yes | - | - | - | - | +| Swin Transformer | TensorRT | Yes | Yes | - | - | - | - | +| ViT | PyTorch | Yes | Yes | - | - | - | - | +| ViT | TensorRT | Yes | Yes | - | - | - | - | +| GPT-NeoX | PyTorch | Yes | - | - | Yes | Yes | - | +| GPT-NeoX | Triton backend | Yes | - | - | Yes | Yes | - | +| BART/mBART | PyTorch | Yes | - | - | Yes | Yes | - | +| WeNet | C++ | Yes | - | - | - | - | - | +| DeBERTa | TensorFlow 2 | Yes | - | - | On-going | On-going | - | +| DeBERTa | PyTorch | Yes | - | - | On-going | On-going | - | + +* Note that the FasterTransformer supports the models above on C++ because all source codes are built on C++. + +More details of specific models are put in `xxx_guide.md` of [`docs/`](docs), where `xxx` means the model name. Some common questions and the respective answers are put in [`docs/QAList.md`](docs/QAList.md). Note that the model of Encoder and BERT are similar and we put the explanation into `bert_guide.md` together. + +## Advanced + +The following code lists the directory structure of FasterTransformer: + +``` +/src/fastertransformer: source code of FasterTransformer + |--/cutlass_extensions: Implementation of cutlass gemm/kernels. + |--/kernels: CUDA kernels for different models/layers and operations, like addBiasResiual. + |--/layers: Implementation of layer modules, like attention layer, ffn layer. + |--/models: Implementation of different models, like BERT, GPT. + |--/tensorrt_plugin: encapluate FasterTransformer into TensorRT plugin. 
+    |--/tf_op: custom TensorFlow OP implementation
+    |--/th_op: custom PyTorch OP implementation
+    |--/triton_backend: custom Triton backend implementation
+    |--/utils: Contains common CUDA utilities, like cublasMMWrapper, memory_utils
+/examples: C++, TensorFlow and PyTorch interface examples
+    |--/cpp: C++ interface examples
+    |--/pytorch: PyTorch OP examples
+    |--/tensorflow: TensorFlow OP examples
+    |--/tensorrt: TensorRT examples
+/docs: Documents explaining the implementation details of the different models, and showing the benchmarks
+/benchmark: Contains the scripts to run the benchmarks of different models
+/tests: Unit tests
+/templates: Documents explaining how to add a new model/example to the FasterTransformer repo
+```
+
+Note that many folders contain sub-folders to separate the different models. Quantization tools have been moved to `examples`, like `examples/tensorflow/bert/bert-quantization/` and `examples/pytorch/bert/bert-quantization-sparsity/`.
+
+
+### Global Environment
+
+FasterTransformer provides some convenient environment variables for debugging and testing.
+
+1. `FT_LOG_LEVEL`: This environment variable controls the log level of debug messages. More details are in `src/fastertransformer/utils/logger.h`. Note that the program prints a large number of messages, and becomes very slow, when the level is lower than `DEBUG`.
+2. `FT_NVTX`: If it is set to `ON`, as in `FT_NVTX=ON ./bin/gpt_example`, the program inserts NVTX tags to help profile the program.
+3. `FT_DEBUG_LEVEL`: If it is set to `DEBUG`, the program runs `cudaDeviceSynchronize()` after every kernel; otherwise, kernels are executed asynchronously by default. This is helpful for locating the failing kernel during debugging, but it affects performance significantly, so it should be used only for debugging.
+
+## Performance
+
+Hardware settings:
+
+* 8xA100-80GBs (with mclk 1593MHz, pclk 1410MHz) with AMD EPYC 7742 64-Core Processor
+* T4 (with mclk 5000MHz, pclk 1590MHz) with Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz
+
+To run the following benchmarks, we need to install the Unix calculator tool `bc`:
+
+```bash
+apt-get install bc
+```
+
+### BERT base performance
+
+The FP16 results of TensorFlow were obtained by running `benchmarks/bert/tf_benchmark.sh`.
+
+The INT8 results of TensorFlow were obtained by running `benchmarks/bert/tf_int8_benchmark.sh`.
+
+The FP16 results of PyTorch were obtained by running `benchmarks/bert/pyt_benchmark.sh`.
+
+The INT8 results of PyTorch were obtained by running `benchmarks/bert/pyt_int8_benchmark.sh`.
+
+More benchmarks are provided in [`docs/bert_guide.md`](docs/bert_guide.md#bert-performance).
+
+#### BERT base performances of FasterTransformer new features
+
+The following figure compares the performance of the new FasterTransformer features with standard FasterTransformer under FP16 on T4.
+
+For large batch sizes and sequence lengths, both EFF-FT and FT-INT8-v2 bring about a 2x speedup. Using Effective FasterTransformer and INT8-v2 at the same time brings about a 3.5x speedup compared to FasterTransformer FP16 for large cases.
+
+*(Figure: BERT base performance of EFF-FT and FT-INT8-v2 on T4, FP16)*
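+
+For reference, the numbers above come from the plain shell scripts listed in this section, and the variables from the [Global Environment](#global-environment) section can be combined with any example binary. The snippet below is only an illustrative sketch: the script paths and `./bin/gpt_example` are taken from this document, while launching from the repository root and the exact combination of variables are assumptions.
+
+```bash
+# Run the BERT FP16 and INT8 benchmarks (paths as listed above; assumed to be
+# launched from the repository root after building).
+bash benchmarks/bert/tf_benchmark.sh
+bash benchmarks/bert/pyt_int8_benchmark.sh
+
+# Combine the debugging environment variables with an example binary:
+# verbose logging, NVTX ranges for profiling, and per-kernel synchronization.
+FT_LOG_LEVEL=DEBUG FT_NVTX=ON FT_DEBUG_LEVEL=DEBUG ./bin/gpt_example
+```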
+
+#### BERT base performance on TensorFlow
+
+The following figure compares the performance of different FasterTransformer features with TensorFlow XLA under FP16 on T4.
+
+For small batch sizes and sequence lengths, FasterTransformer brings about a 3x speedup.
+
+For large batch sizes and sequence lengths, Effective FasterTransformer with INT8-v2 quantization brings about a 5x speedup.
+
+*(Figure: FasterTransformer vs. TensorFlow XLA, BERT base, FP16, T4)*
+
+#### BERT base performance on PyTorch
+
+The following figure compares the performance of different FasterTransformer features with PyTorch TorchScript under FP16 on T4.
+
+For small batch sizes and sequence lengths, the FasterTransformer custom extension (CustomExt) brings about a 4x ~ 6x speedup.
+
+For large batch sizes and sequence lengths, Effective FasterTransformer with INT8-v2 quantization brings about a 5x speedup.
+
+*(Figure: FasterTransformer vs. PyTorch TorchScript, BERT base, FP16, T4)*
+
+### Decoding and Decoder performance
+
+The results of TensorFlow were obtained by running `benchmarks/decoding/tf_decoding_beamsearch_benchmark.sh` and `benchmarks/decoding/tf_decoding_sampling_benchmark.sh`.
+
+The results of PyTorch were obtained by running `benchmarks/decoding/pyt_decoding_beamsearch_benchmark.sh`.
+
+In the decoding experiments, we used the following parameters:
+
+* head_num = 8
+* size_per_head = 64
+* num_layers = 6 for both encoder and decoder
+* vocabulary_size = 32001 for the TensorFlow sample codes, 31538 for the PyTorch sample codes
+* memory_hidden_dim = 512
+* max sequence length = 128
+
+More benchmarks are provided in [`docs/decoder_guide.md`](docs/decoder_guide.md#decoding-performance).
+
+#### Decoder and Decoding end-to-end translation performance on TensorFlow
+
+The following figure shows the speedup of the FT-Decoder op and the FT-Decoding op compared to TensorFlow under FP16 on T4. Here, we use the throughput of translating a whole test set, because the total number of generated tokens may differ between methods. Compared to TensorFlow, FT-Decoder provides a 1.5x ~ 3x speedup, while FT-Decoding provides a 4x ~ 18x speedup.
+
+*(Figure: FT-Decoder and FT-Decoding speedup over TensorFlow, FP16, T4)*
+
+#### Decoder and Decoding end-to-end translation performance on PyTorch
+
+The following figure shows the speedup of the FT-Decoder op and the FT-Decoding op compared to PyTorch under FP16 on T4. Here, we use the throughput of translating a whole test set, because the total number of generated tokens may differ between methods. Compared to PyTorch, FT-Decoder provides a 1.2x ~ 3x speedup, while FT-Decoding provides a 3.8x ~ 13x speedup.
+
+*(Figure: FT-Decoder and FT-Decoding speedup over PyTorch, FP16, T4)*
+
+### GPT performance
+
+The following figure compares the performance of Megatron and FasterTransformer under FP16 on A100.
+
+In the decoding experiments, we used the following parameters:
+
+* head_num = 96
+* size_per_head = 128
+* num_layers = 48 for the GPT-89B model, 96 for the GPT-175B model
+* data_type = FP16
+* vocab_size = 51200
+* top_p = 0.9
+* tensor parallel size = 8
+* input sequence length = 512
+* output sequence length = 32
+
+*(Figure: Megatron vs. FasterTransformer, GPT, FP16, A100)*
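+
+As a rough, illustrative sketch of how such a tensor-parallel run is typically launched (the `multi_gpu_gpt_example` binary name and the `mpirun` launch are assumptions based on the `examples/cpp` directory described above, not the exact benchmark command):
+
+```bash
+# One process per GPU: tensor parallel size = 8 on the 8xA100 node.
+mpirun -n 8 ./bin/multi_gpu_gpt_example
+```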
+ +## Release notes + +### Changelog + +May 2023 +- Fix bugs of generation early stopping + +January 2023 +- Support GPT MoE +- Support FP8 for Bert and GPT (**Experimental**) +- Support DeBERTa on TensorFlow 2 and PyTorch + +Dec 2022 +- **Release the FasterTransformer 5.2** +- Support min length penalty + +Nov 2022 +- Support T5 Tensorflow 2 custom op. +- Support T5 MoE +- Support WeNet +- Support BART & mBART +- Support SwinV2 +- Initial support for w8a8 int8 mode with GPT (preview) +- Support fused mha in GPT + +Oct 2022 +- Support BLOOM + +Sep 2022 +- Support factual sampling ([link](https://arxiv.org/pdf/2206.04624.pdf)) in gpt +- Support for IA3 adapting scheme in T5 + +Aug 2022 +- Support returning context tokens embeddings in GPT +- **Release the FasterTransformer 5.1** +- Support for interactive generation +- Support for attention time-limited memory +- Support mt5 and t5-v1.1 + +July 2022 +- Support UL2 huggingface ckpt. ([link](https://huggingface.co/google/ul2)) + - Fix bug of T5 under bfloat16. +- Add ViT INT8 TensorRT Plugin +- Support batch sampling +- Support shared context optimization in GPT model + +June 2022 +- Support streaming generation for triton backend. +- Support OPT. +- Support multi-node multi-GPU BERT under FP32, FP16 and BF16. + +May 2022 +- Support bfloat16 on most models. +- Support [prefix-prompt](https://arxiv.org/pdf/2101.00190.pdf) for GPT-J. +- Support GPT-NeoX. + - epsilon value used in layernorm is now a parameter + - rotary embedding GPT-NeoX style (only GPT-J was implemented) + - load per-GPU layernorm and bias parameters + - weight conversion from EleutherAI checkpoint + +April 2022 +- **Release the FasterTransformer 5.0** + - Change the default accumulation type of all gemm to FP32. + - Support bfloat16 inference in GPT model. + - Support Nemo Megatron T5 and Megatron-LM T5 model. + - Support ViT. + +March 2022 +- Support `stop_ids` and `ban_bad_ids` in GPT-J. +- Support dynamice `start_id` and `end_id` in GPT-J, GPT, T5 and Decoding. + +February 2022 +- Support Swin Transformer. +- Optimize the k/v cache update of beam search by in-direction buffer. +- Support runtime input for GPT-J, T5 and GPT. +- Support soft prompt in GPT and GPT-J. +- Support custom all reduce kernel. + - Limitation: + 1. Only support tensor parallel size = 8 on DGX-A100. + 2. Only support CUDA with cudaMallocAsync. + +December 2021 +- Add TensorRT plugin of T5 model. +- Change some hyper-parameters of GPT model to runtime query. +- Optimize the memory allocator under C++ code. +- Fix bug of CUB including when using CUDA 11.5 or newer version. + +November 2021 +- **Update the FasterTransformer 5.0 beta** +- Add GPT-3 INT8 weight only qauntization for batch size <= 2. +- Support multi-node multi-gpu support on T5. +- Enhance the multi-node multi-gpu supporting in GPT-3. + +August 2021 +- **Release the FasterTransformer 5.0 beta** + - Refactor the repo and codes + - And special thanks to NAVER Corp. for contributing a lot to this version, as listed below. + - Bugs fix + - Fix error that occurs when batch_size is less than max_batch_size for gpt pytorch wrapper. + - Fix memory leak that occurs every forward because of reused allocator. + - Fix race condition that occurs in repetition penalty kernel. + - Enhancement + - Add random seed setting. + - Fix GEMM buffer overflow on FP16 of GPT. + - Change to invalidate finished buffer for every completion. + - Introduce stop_before for early stop. + - Support Longformer. + - Rename `layer_para` to `pipeline_para`. 
+ - Optimize the sorting of top p sampling. + - Support sparsity for Ampere GPUs on BERT. + - Support `size_per_head` 96, 160, 192, 224, 256 for GPT model. + - Support multi-node inference for GPT Triton backend. + +June 2021 +- Support XLNet + +April 2021 +- **Release the FasterTransformer 4.0** + - Support multi-gpus and multi-nodes inference for GPT model on C++ and PyTorch. + - Support single node, multi-gpus inference for GPT model on triton. + - Add the int8 fused multi-head attention kernel for bert. + - Add the FP16 fused multi-head attention kernel of V100 for bert. + - Optimize the kernel of decoder. + - Move to independent repo. + - Eager mode PyTorch extension is deprecated. + +Dec 2020 +- **Release the FasterTransformer 3.1** + - Optimize the decoding by adding the finisehd mask to prevent useless computing. + - Support opennmt encoder. + - Remove the TensorRT plugin supporting. + - TorchScript custom op is deprecated. + +Nov 2020 +- Optimize the INT8 inference. +- Support PyTorch INT8 inference. +- Provide PyTorch INT8 quantiztion tools. +- Integrate the fused multi-head attention kernel of TensorRT into FasterTransformer. +- Add unit test of SQuAD. +- Update the missed NGC checkpoints. + +Sep 2020 +- Support GPT2 +- **Release the FasterTransformer 3.0** + - Support INT8 quantization of encoder of cpp and TensorFlow op. + - Add bert-tf-quantization tool. + - Fix the issue that Cmake 15 or Cmake 16 fail to build this project. + +Aug 2020 +- Fix the bug of trt plugin. + +June 2020 +- **Release the FasterTransformer 2.1** + - Add Effective FasterTransformer based on the idea of [Effective Transformer](https://github.com/bytedance/effective_transformer) idea. + - Optimize the beam search kernels. + - Add PyTorch op supporting + +May 2020 +- Fix the bug that seq_len of encoder must be larger than 3. +- Add the position_encoding of decoding as the input of FasterTransformer decoding. This is convenient to use different types of position encoding. FasterTransformer does not compute the position encoding value, but only lookup the table. +- Modifying the method of loading model in `translate_sample.py`. + +April 2020 +- Rename `decoding_opennmt.h` to `decoding_beamsearch.h` +- Add DiverseSiblingsSearch for decoding. +- Add sampling into Decoding + - The implementation is in the `decoding_sampling.h` + - Add top_k sampling, top_p sampling for decoding. +- Refactor the tensorflow custom op codes. + - Merge `bert_transformer_op.h`, `bert_transformer_op.cu.cc` into `bert_transformer_op.cc` + - Merge `decoder.h`, `decoder.cu.cc` into `decoder.cc` + - Merge `decoding_beamsearch.h`, `decoding_beamsearch.cu.cc` into `decoding_beamsearch.cc` +- Fix the bugs of finalize function decoding.py. +- Fix the bug of tf DiverseSiblingSearch. +- Add BLEU scorer `bleu_score.py` into `utils`. Note that the BLEU score requires python3. +- Fuse QKV Gemm of encoder and masked_multi_head_attention of decoder. +- Add dynamic batch size and dynamic sequence length features into all ops. + +March 2020 +- Add feature in FasterTransformer 2.0 + - Add `translate_sample.py` to demonstrate how to translate a sentence by restoring the pretrained model of OpenNMT-tf. +- Fix bugs of Fastertransformer 2.0 + - Fix the bug of maximum sequence length of decoder cannot be larger than 128. + - Fix the bug that decoding does not check finish or not after each step. + - Fix the bug of decoder about max_seq_len. + - Modify the decoding model structure to fit the OpenNMT-tf decoding model. 
+ - Add a layer normalization layer after decoder. + - Add a normalization for inputs of decoder + +February 2020 +- **Release the FasterTransformer 2.0** + - Provide a highly optimized OpenNMT-tf based decoder and decoding, including C++ API and TensorFlow op. + - Refine the sample codes of encoder. + - Add dynamic batch size feature into encoder op. + +July 2019 +- **Release the FasterTransformer 1.0** + - Provide a highly optimized bert equivalent transformer layer, including C++ API, TensorFlow op and TensorRT plugin. + +### Known issues + +- Cannot compile on tensorflow 2.10 due to undefined symbol issue. +- Undefined symbol errors when import the extension + - Please `import torch` first. If this has been done, it is due to the incompatible C++ ABI. You may need to check the PyTorch used during compilation and execution are the same, or you need to check how your PyTorch is compiled, or the version of your GCC, etc. +- Results of TensorFlow and OP would be different in decoding. This problem is caused by the accumulated log probability, and we do not avoid this problem. +- If encounter some problem in the custom environment, try to use the gcc/g++ 4.8 to build the project of TensorFlow op, especially for TensorFlow 1.14. diff --git a/README.md b/README.md index a00e0d631..50f50cab2 100644 --- a/README.md +++ b/README.md @@ -1,417 +1,15 @@ -# FasterTransformer +# FasterTransformer for SaumsungCEChallenge -This repository provides a script and recipe to run the highly optimized transformer-based encoder and decoder component, and it is tested and maintained by NVIDIA. +Check out FasterTransformer [README.md](FasterTransformerReadME.md) -## Table Of Contents +## Installation -- [FasterTransformer](#fastertransformer) - - [Table Of Contents](#table-of-contents) - - [Model overview](#model-overview) - - [Support matrix](#support-matrix) - - [Advanced](#advanced) - - [Global Environment](#global-environment) - - [Performance](#performance) - - [BERT base performance](#bert-base-performance) - - [BERT base performances of FasterTransformer new features](#bert-base-performances-of-fastertransformer-new-features) - - [BERT base performance on TensorFlow](#bert-base-performance-on-tensorflow) - - [BERT base performance on PyTorch](#bert-base-performance-on-pytorch) - - [Decoding and Decoder performance](#decoding-and-decoder-performance) - - [Decoder and Decoding end-to-end translation performance on TensorFlow](#decoder-and-decoding-end-to-end-translation-performance-on-tensorflow) - - [Decoder and Decoding end-to-end translation performance on PyTorch](#decoder-and-decoding-end-to-end-translation-performance-on-pytorch) - - [GPT performance](#gpt-performance) - - [Release notes](#release-notes) - - [Changelog](#changelog) - - [Known issues](#known-issues) - -## Model overview - -In NLP, encoder and decoder are two important components, with the transformer layer becoming a popular architecture for both components. FasterTransformer implements a highly optimized transformer layer for both the encoder and decoder for inference. On Volta, Turing and Ampere GPUs, the computing power of Tensor Cores are used automatically when the precision of the data and weights are FP16. - -FasterTransformer is built on top of CUDA, cuBLAS, cuBLASLt and C++. We provide at least one API of the following frameworks: TensorFlow, PyTorch and Triton backend. Users can integrate FasterTransformer into these frameworks directly. 
For supporting frameworks, we also provide example codes to demonstrate how to use, and show the performance on these frameworks. - -### Support matrix - -| Models | Framework | FP16 | INT8 (after Turing) | Sparsity (after Ampere) | Tensor parallel | Pipeline parallel | FP8 (after Hopper) | -| ---------------- | -------------- | ---- | ------------------- | ----------------------- | --------------- | ----------------- | ------------------ | -| BERT | TensorFlow | Yes | Yes | - | - | - | - | -| BERT | PyTorch | Yes | Yes | Yes | Yes | Yes | - | -| BERT | Triton backend | Yes | - | - | Yes | Yes | - | -| BERT | C++ | Yes | Yes | - | - | - | Yes | -| XLNet | C++ | Yes | - | - | - | - | - | -| Encoder | TensorFlow | Yes | Yes | - | - | - | - | -| Encoder | PyTorch | Yes | Yes | Yes | - | - | - | -| Decoder | TensorFlow | Yes | - | - | - | - | - | -| Decoder | PyTorch | Yes | - | - | - | - | - | -| Decoding | TensorFlow | Yes | - | - | - | - | - | -| Decoding | PyTorch | Yes | - | - | - | - | - | -| GPT | TensorFlow | Yes | - | - | - | - | - | -| GPT/OPT | PyTorch | Yes | - | - | Yes | Yes | Yes | -| GPT/OPT | Triton backend | Yes | - | - | Yes | Yes | - | -| GPT-MoE | PyTorch | Yes | - | - | Yes | Yes | - | -| BLOOM | PyTorch | Yes | - | - | Yes | Yes | - | -| BLOOM | Triton backend | Yes | - | - | Yes | Yes | - | -| GPT-J | Triton backend | Yes | - | - | Yes | Yes | - | -| Longformer | PyTorch | Yes | - | - | - | - | - | -| T5/UL2 | PyTorch | Yes | - | - | Yes | Yes | - | -| T5 | TensorFlow 2 | Yes | - | - | - | - | - | -| T5/UL2 | Triton backend | Yes | - | - | Yes | Yes | - | -| T5 | TensorRT | Yes | - | - | Yes | Yes | - | -| T5-MoE | PyTorch | Yes | - | - | Yes | Yes | - | -| Swin Transformer | PyTorch | Yes | Yes | - | - | - | - | -| Swin Transformer | TensorRT | Yes | Yes | - | - | - | - | -| ViT | PyTorch | Yes | Yes | - | - | - | - | -| ViT | TensorRT | Yes | Yes | - | - | - | - | -| GPT-NeoX | PyTorch | Yes | - | - | Yes | Yes | - | -| GPT-NeoX | Triton backend | Yes | - | - | Yes | Yes | - | -| BART/mBART | PyTorch | Yes | - | - | Yes | Yes | - | -| WeNet | C++ | Yes | - | - | - | - | - | -| DeBERTa | TensorFlow 2 | Yes | - | - | On-going | On-going | - | -| DeBERTa | PyTorch | Yes | - | - | On-going | On-going | - | - -* Note that the FasterTransformer supports the models above on C++ because all source codes are built on C++. - -More details of specific models are put in `xxx_guide.md` of [`docs/`](docs), where `xxx` means the model name. Some common questions and the respective answers are put in [`docs/QAList.md`](docs/QAList.md). Note that the model of Encoder and BERT are similar and we put the explanation into `bert_guide.md` together. - -## Advanced - -The following code lists the directory structure of FasterTransformer: ``` -/src/fastertransformer: source code of FasterTransformer - |--/cutlass_extensions: Implementation of cutlass gemm/kernels. - |--/kernels: CUDA kernels for different models/layers and operations, like addBiasResiual. - |--/layers: Implementation of layer modules, like attention layer, ffn layer. - |--/models: Implementation of different models, like BERT, GPT. - |--/tensorrt_plugin: encapluate FasterTransformer into TensorRT plugin. 
- |--/tf_op: custom Tensorflow OP implementation - |--/th_op: custom PyTorch OP implementation - |--/triton_backend: custom triton backend implementation - |--/utils: Contains common cuda utils, like cublasMMWrapper, memory_utils -/examples: C++, tensorflow and pytorch interface examples - |--/cpp: C++ interface examples - |--/pytorch: PyTorch OP examples - |--/tensorflow: TensorFlow OP examples - |--/tensorrt: TensorRT examples -/docs: Documents to explain the details of implementation of different models, and show the benchmark -/benchmark: Contains the scripts to run the benchmarks of different models -/tests: Unit tests -/templates: Documents to explain how to add a new model/example into FasterTransformer repo -``` - -Note that many folders contains many sub-folders to split different models. Quantization tools are move to `examples`, like `examples/tensorflow/bert/bert-quantization/` and `examples/pytorch/bert/bert-quantization-sparsity/`. - - -### Global Environment - -FasterTransformer provides some convenient environment variables for debuging and testing. - -1. `FT_LOG_LEVEL`: This environment controls the log level of debug messae. More details are in `src/fastertransformer/utils/logger.h`. Note that the program will print lots of message when the level is lower than `DEBUG` and the program would become very slow. -2. `FT_NVTX`: If it is set to be `ON` like `FT_NVTX=ON ./bin/gpt_example`, the program will insert tha tag of nvtx to help profiling the program. -3. `FT_DEBUG_LEVEL`: If it is set to be `DEBUG`, then the program will run `cudaDeviceSynchronize()` after every kernels. Otherwise, the kernel is executued asynchronously by default. It is helpful to locate the error point during debuging. But this flag affects the performance of program significantly. So, it should be used only for debuging. - -## Performance - -Hardware settings: - -* 8xA100-80GBs (with mclk 1593MHz, pclk 1410MHz) with AMD EPYC 7742 64-Core Processor -* T4 (with mclk 5000MHz, pclk 1590MHz) with Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz - -In order to run the following benchmark, we need to install the unix computing tool "bc" by - -```bash -apt-get install bc +mkdir -p FasterTransformer/build +cd FasterTransformer/build +git submodule init && git submodule update +cmake -DSM=xx -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON . +make -j32 ``` -### BERT base performance - -The FP16 results of TensorFlow were obtained by running the `benchmarks/bert/tf_benchmark.sh`. - -The INT8 results of TensorFlow were obtained by running the `benchmarks/bert/tf_int8_benchmark.sh`. - -The FP16 results of PyTorch were obtained by running the `benchmarks/bert/pyt_benchmark.sh`. - -The INT8 results of PyTorch were obtained by running the `benchmarks/bert/pyt_int8_benchmark.sh`. - -More benchmarks are put in [`docs/bert_guide.md`](docs/bert_guide.md#bert-performance). - -#### BERT base performances of FasterTransformer new features - -The following figure compares the performances of different features of FasterTransformer and FasterTransformer under FP16 on T4. - -For large batch size and sequence length, both EFF-FT and FT-INT8-v2 bring about 2x speedup. Using Effective FasterTransformer and int8v2 at the same time can bring about 3.5x speedup compared to FasterTransformer FP16 for large case. - -
- -#### BERT base performance on TensorFlow - -The following figure compares the performances of different features of FasterTransformer and TensorFlow XLA under FP16 on T4. - -For small batch size and sequence length, using FasterTransformer can bring about 3x speedup. - -For large batch size and sequence length, using Effective FasterTransformer with INT8-v2 quantization can bring about 5x speedup. - -
- -#### BERT base performance on PyTorch - -The following figure compares the performances of different features of FasterTransformer and PyTorch TorchScript under FP16 on T4. - -For small batch size and sequence length, using FasterTransformer CustomExt can bring about 4x ~ 6x speedup. - -For large batch size and sequence length, using Effective FasterTransformer with INT8-v2 quantization can bring about 5x speedup. - -
- -### Decoding and Decoder performance - -The results of TensorFlow were obtained by running the `benchmarks/decoding/tf_decoding_beamsearch_benchmark.sh` and `benchmarks/decoding/tf_decoding_sampling_benchmark.sh` - -The results of PyTorch were obtained by running the `benchmarks/decoding/pyt_decoding_beamsearch_benchmark.sh`. - -In the experiments of decoding, we updated the following parameters: - -* head_num = 8 -* size_per_head = 64 -* num_layers = 6 for both encoder and decoder -* vocabulary_size = 32001 for TensorFlow sample codes, 31538 for PyTorch sample codes -* memory_hidden_dim = 512 -* max sequenc elength = 128 - -More benchmarks are put in [`docs/decoder_guide.md`](docs/decoder_guide.md#decoding-performance). - -#### Decoder and Decoding end-to-end translation performance on TensorFlow - -The following figure shows the speedup of of FT-Decoder op and FT-Decoding op compared to TensorFlow under FP16 with T4. Here, we use the throughput of translating a test set to prevent the total tokens of each methods may be different. Compared to TensorFlow, FT-Decoder provides 1.5x ~ 3x speedup; while FT-Decoding provides 4x ~ 18x speedup. - -
- -#### Decoder and Decoding end-to-end translation performance on PyTorch - -The following figure shows the speedup of of FT-Decoder op and FT-Decoding op compared to PyTorch under FP16 with T4. Here, we use the throughput of translating a test set to prevent the total tokens of each methods may be different. Compared to PyTorch, FT-Decoder provides 1.2x ~ 3x speedup; while FT-Decoding provides 3.8x ~ 13x speedup. - -
- -### GPT performance - -The following figure compares the performances of Megatron and FasterTransformer under FP16 on A100. - -In the experiments of decoding, we updated the following parameters: - -* head_num = 96 -* size_per_head = 128 -* num_layers = 48 for GPT-89B model, 96 for GPT-175B model -* data_type = FP16 -* vocab_size = 51200 -* top_p = 0.9 -* tensor parallel size = 8 -* input sequence length = 512 -* output sequence length = 32 - -
- -## Release notes - -### Changelog - -May 2023 -- Fix bugs of generation early stopping - -January 2023 -- Support GPT MoE -- Support FP8 for Bert and GPT (**Experimental**) -- Support DeBERTa on TensorFlow 2 and PyTorch - -Dec 2022 -- **Release the FasterTransformer 5.2** -- Support min length penalty - -Nov 2022 -- Support T5 Tensorflow 2 custom op. -- Support T5 MoE -- Support WeNet -- Support BART & mBART -- Support SwinV2 -- Initial support for w8a8 int8 mode with GPT (preview) -- Support fused mha in GPT - -Oct 2022 -- Support BLOOM - -Sep 2022 -- Support factual sampling ([link](https://arxiv.org/pdf/2206.04624.pdf)) in gpt -- Support for IA3 adapting scheme in T5 - -Aug 2022 -- Support returning context tokens embeddings in GPT -- **Release the FasterTransformer 5.1** -- Support for interactive generation -- Support for attention time-limited memory -- Support mt5 and t5-v1.1 - -July 2022 -- Support UL2 huggingface ckpt. ([link](https://huggingface.co/google/ul2)) - - Fix bug of T5 under bfloat16. -- Add ViT INT8 TensorRT Plugin -- Support batch sampling -- Support shared context optimization in GPT model - -June 2022 -- Support streaming generation for triton backend. -- Support OPT. -- Support multi-node multi-GPU BERT under FP32, FP16 and BF16. - -May 2022 -- Support bfloat16 on most models. -- Support [prefix-prompt](https://arxiv.org/pdf/2101.00190.pdf) for GPT-J. -- Support GPT-NeoX. - - epsilon value used in layernorm is now a parameter - - rotary embedding GPT-NeoX style (only GPT-J was implemented) - - load per-GPU layernorm and bias parameters - - weight conversion from EleutherAI checkpoint - -April 2022 -- **Release the FasterTransformer 5.0** - - Change the default accumulation type of all gemm to FP32. - - Support bfloat16 inference in GPT model. - - Support Nemo Megatron T5 and Megatron-LM T5 model. - - Support ViT. - -March 2022 -- Support `stop_ids` and `ban_bad_ids` in GPT-J. -- Support dynamice `start_id` and `end_id` in GPT-J, GPT, T5 and Decoding. - -February 2022 -- Support Swin Transformer. -- Optimize the k/v cache update of beam search by in-direction buffer. -- Support runtime input for GPT-J, T5 and GPT. -- Support soft prompt in GPT and GPT-J. -- Support custom all reduce kernel. - - Limitation: - 1. Only support tensor parallel size = 8 on DGX-A100. - 2. Only support CUDA with cudaMallocAsync. - -December 2021 -- Add TensorRT plugin of T5 model. -- Change some hyper-parameters of GPT model to runtime query. -- Optimize the memory allocator under C++ code. -- Fix bug of CUB including when using CUDA 11.5 or newer version. - -November 2021 -- **Update the FasterTransformer 5.0 beta** -- Add GPT-3 INT8 weight only qauntization for batch size <= 2. -- Support multi-node multi-gpu support on T5. -- Enhance the multi-node multi-gpu supporting in GPT-3. - -August 2021 -- **Release the FasterTransformer 5.0 beta** - - Refactor the repo and codes - - And special thanks to NAVER Corp. for contributing a lot to this version, as listed below. - - Bugs fix - - Fix error that occurs when batch_size is less than max_batch_size for gpt pytorch wrapper. - - Fix memory leak that occurs every forward because of reused allocator. - - Fix race condition that occurs in repetition penalty kernel. - - Enhancement - - Add random seed setting. - - Fix GEMM buffer overflow on FP16 of GPT. - - Change to invalidate finished buffer for every completion. - - Introduce stop_before for early stop. - - Support Longformer. - - Rename `layer_para` to `pipeline_para`. 
- - Optimize the sorting of top p sampling. - - Support sparsity for Ampere GPUs on BERT. - - Support `size_per_head` 96, 160, 192, 224, 256 for GPT model. - - Support multi-node inference for GPT Triton backend. - -June 2021 -- Support XLNet - -April 2021 -- **Release the FasterTransformer 4.0** - - Support multi-gpus and multi-nodes inference for GPT model on C++ and PyTorch. - - Support single node, multi-gpus inference for GPT model on triton. - - Add the int8 fused multi-head attention kernel for bert. - - Add the FP16 fused multi-head attention kernel of V100 for bert. - - Optimize the kernel of decoder. - - Move to independent repo. - - Eager mode PyTorch extension is deprecated. - -Dec 2020 -- **Release the FasterTransformer 3.1** - - Optimize the decoding by adding the finisehd mask to prevent useless computing. - - Support opennmt encoder. - - Remove the TensorRT plugin supporting. - - TorchScript custom op is deprecated. - -Nov 2020 -- Optimize the INT8 inference. -- Support PyTorch INT8 inference. -- Provide PyTorch INT8 quantiztion tools. -- Integrate the fused multi-head attention kernel of TensorRT into FasterTransformer. -- Add unit test of SQuAD. -- Update the missed NGC checkpoints. - -Sep 2020 -- Support GPT2 -- **Release the FasterTransformer 3.0** - - Support INT8 quantization of encoder of cpp and TensorFlow op. - - Add bert-tf-quantization tool. - - Fix the issue that Cmake 15 or Cmake 16 fail to build this project. - -Aug 2020 -- Fix the bug of trt plugin. - -June 2020 -- **Release the FasterTransformer 2.1** - - Add Effective FasterTransformer based on the idea of [Effective Transformer](https://github.com/bytedance/effective_transformer) idea. - - Optimize the beam search kernels. - - Add PyTorch op supporting - -May 2020 -- Fix the bug that seq_len of encoder must be larger than 3. -- Add the position_encoding of decoding as the input of FasterTransformer decoding. This is convenient to use different types of position encoding. FasterTransformer does not compute the position encoding value, but only lookup the table. -- Modifying the method of loading model in `translate_sample.py`. - -April 2020 -- Rename `decoding_opennmt.h` to `decoding_beamsearch.h` -- Add DiverseSiblingsSearch for decoding. -- Add sampling into Decoding - - The implementation is in the `decoding_sampling.h` - - Add top_k sampling, top_p sampling for decoding. -- Refactor the tensorflow custom op codes. - - Merge `bert_transformer_op.h`, `bert_transformer_op.cu.cc` into `bert_transformer_op.cc` - - Merge `decoder.h`, `decoder.cu.cc` into `decoder.cc` - - Merge `decoding_beamsearch.h`, `decoding_beamsearch.cu.cc` into `decoding_beamsearch.cc` -- Fix the bugs of finalize function decoding.py. -- Fix the bug of tf DiverseSiblingSearch. -- Add BLEU scorer `bleu_score.py` into `utils`. Note that the BLEU score requires python3. -- Fuse QKV Gemm of encoder and masked_multi_head_attention of decoder. -- Add dynamic batch size and dynamic sequence length features into all ops. - -March 2020 -- Add feature in FasterTransformer 2.0 - - Add `translate_sample.py` to demonstrate how to translate a sentence by restoring the pretrained model of OpenNMT-tf. -- Fix bugs of Fastertransformer 2.0 - - Fix the bug of maximum sequence length of decoder cannot be larger than 128. - - Fix the bug that decoding does not check finish or not after each step. - - Fix the bug of decoder about max_seq_len. - - Modify the decoding model structure to fit the OpenNMT-tf decoding model. 
- - Add a layer normalization layer after decoder. - - Add a normalization for inputs of decoder - -February 2020 -- **Release the FasterTransformer 2.0** - - Provide a highly optimized OpenNMT-tf based decoder and decoding, including C++ API and TensorFlow op. - - Refine the sample codes of encoder. - - Add dynamic batch size feature into encoder op. - -July 2019 -- **Release the FasterTransformer 1.0** - - Provide a highly optimized bert equivalent transformer layer, including C++ API, TensorFlow op and TensorRT plugin. - -### Known issues - -- Cannot compile on tensorflow 2.10 due to undefined symbol issue. -- Undefined symbol errors when import the extension - - Please `import torch` first. If this has been done, it is due to the incompatible C++ ABI. You may need to check the PyTorch used during compilation and execution are the same, or you need to check how your PyTorch is compiled, or the version of your GCC, etc. -- Results of TensorFlow and OP would be different in decoding. This problem is caused by the accumulated log probability, and we do not avoid this problem. -- If encounter some problem in the custom environment, try to use the gcc/g++ 4.8 to build the project of TensorFlow op, especially for TensorFlow 1.14. From 6bbba8602919f6234bb6133a0381d43a5ab746a0 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 11 Sep 2023 08:59:12 +0000 Subject: [PATCH 02/55] add lamma template --- src/fastertransformer/models/CMakeLists.txt | 1 + .../models/llama/CMakeLists.txt | 69 + src/fastertransformer/models/llama/LLaMA.cc | 1211 +++++++++++++++++ src/fastertransformer/models/llama/LLaMA.h | 218 +++ .../models/llama/LLaMAContextDecoder.cc | 514 +++++++ .../models/llama/LLaMAContextDecoder.h | 117 ++ .../models/llama/LLaMADecoder.cc | 391 ++++++ .../models/llama/LLaMADecoder.h | 104 ++ .../models/llama/LLaMADecoderLayerWeight.cc | 220 +++ .../models/llama/LLaMADecoderLayerWeight.h | 62 + .../models/llama/LLaMAWeight.cc | 302 ++++ .../models/llama/LLaMAWeight.h | 106 ++ .../th_op/llama/CMakeLists.txt | 17 + src/fastertransformer/th_op/llama/LLaMA.cc | 164 +++ src/fastertransformer/th_op/llama/LLaMA.h | 346 +++++ 15 files changed, 3842 insertions(+) create mode 100644 src/fastertransformer/models/llama/CMakeLists.txt create mode 100644 src/fastertransformer/models/llama/LLaMA.cc create mode 100644 src/fastertransformer/models/llama/LLaMA.h create mode 100644 src/fastertransformer/models/llama/LLaMAContextDecoder.cc create mode 100644 src/fastertransformer/models/llama/LLaMAContextDecoder.h create mode 100644 src/fastertransformer/models/llama/LLaMADecoder.cc create mode 100644 src/fastertransformer/models/llama/LLaMADecoder.h create mode 100644 src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc create mode 100644 src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h create mode 100644 src/fastertransformer/models/llama/LLaMAWeight.cc create mode 100644 src/fastertransformer/models/llama/LLaMAWeight.h create mode 100755 src/fastertransformer/th_op/llama/CMakeLists.txt create mode 100755 src/fastertransformer/th_op/llama/LLaMA.cc create mode 100755 src/fastertransformer/th_op/llama/LLaMA.h diff --git a/src/fastertransformer/models/CMakeLists.txt b/src/fastertransformer/models/CMakeLists.txt index 248b4af3d..afc4f8b7b 100644 --- a/src/fastertransformer/models/CMakeLists.txt +++ b/src/fastertransformer/models/CMakeLists.txt @@ -27,6 +27,7 @@ add_subdirectory(t5) add_subdirectory(bart) add_subdirectory(gptj) add_subdirectory(gptneox) +add_subdirectory(llama) 
add_subdirectory(multi_gpu_gpt) if(ENABLE_FP8) add_subdirectory(gpt_fp8) diff --git a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt new file mode 100644 index 000000000..da314ec7d --- /dev/null +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -0,0 +1,69 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.8) + +add_library(LLaMADecoderLayerWeight STATIC LLaMADecoderLayerWeight.cc) +set_property(TARGET LLaMADecoderLayerWeight PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMADecoderLayerWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMADecoderLayerWeight PUBLIC memory_utils cuda_utils logger) + +add_library(LLaMADecoder STATIC LLaMADecoder.cc) +set_property(TARGET LLaMADecoder PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMADecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMADecoder PUBLIC -lcudart cublasMMWrapper + TensorParallelDecoderSelfAttentionLayer + TensorParallelGeluFfnLayer + layernorm_kernels + add_residual_kernels + LLaMADecoderLayerWeight + tensor + nccl_utils + cuda_utils + logger) + +add_library(LLaMAContextDecoder STATIC LLaMAContextDecoder.cc) +set_property(TARGET LLaMAContextDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMAContextDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMAContextDecoder PUBLIC -lcudart cublasMMWrapper + TensorParallelGptContextAttentionLayer + TensorParallelGeluFfnLayer + layernorm_kernels + add_residual_kernels + gpt_kernels + tensor + nccl_utils + cuda_utils + logger) + +add_library(LLaMAWeight STATIC LLaMAWeight.cc) +set_property(TARGET LLaMAWeight PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMAWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMAWeight PUBLIC LLaMADecoderLayerWeight cuda_utils logger) + +add_library(LLaMA STATIC LLaMA.cc) +set_property(TARGET LLaMA PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMA PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMA PUBLIC -lcudart + LLaMADecoder + LLaMAContextDecoder + decoding_kernels + gpt_kernels + DynamicDecodeLayer + BaseBeamSearchLayer + bert_preprocess_kernels + tensor + LLaMAWeight + cuda_utils + logger) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc new file mode 100644 index 000000000..2ce2dae7b --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -0,0 +1,1211 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/models/gptneox/GptNeoX.h" +#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" +#include "src/fastertransformer/kernels/decoding_kernels.h" +#include "src/fastertransformer/kernels/gpt_kernels.h" +#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" +#include + +namespace fastertransformer { + +template +void GptNeoX::initialize() +{ + gpt_context_decoder_ = new GptNeoXContextDecoder(head_num_, + size_per_head_, + inter_size_, + num_layer_, + rotary_embedding_dim_, + neox_rotary_style_, + use_gptj_residual_, + layernorm_eps_, + tensor_para_, + pipeline_para_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + is_context_qk_buf_float_, + attention_type_, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + gpt_decoder_ = new GptNeoXDecoder(head_num_, + size_per_head_, + inter_size_, + num_layer_, + rotary_embedding_dim_, + neox_rotary_style_, + use_gptj_residual_, + layernorm_eps_, + tensor_para_, + pipeline_para_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + dynamic_decode_layer_ = new DynamicDecodeLayer(vocab_size_, + vocab_size_padded_, + 0, // end_id, deprecated + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + cuda_device_prop_); +} + +template +void GptNeoX::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void GptNeoX::allocateBuffer( + size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + const size_t batchxbeam = batch_size * beam_width; + const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batchxbeam * max_cache_seq_len + * hidden_units_ / tensor_para_.world_size_; + + if (vocab_size_ != vocab_size_padded_) { + padded_embedding_kernel_ = + (T*)(allocator_->reMalloc(padded_embedding_kernel_, sizeof(T) * hidden_units_ * vocab_size_padded_, true)); + padded_embedding_kernel_ptr_ = padded_embedding_kernel_; + + padded_embedding_bias_ = + (T*)(allocator_->reMalloc(padded_embedding_bias_, sizeof(T) * vocab_size_padded_, true)); + } + + input_attention_mask_ = (T*)(allocator_->reMalloc( + input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); + decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + decoder_output_buf_ = + (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + normed_decoder_output_buf_ = + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + nccl_logits_buf_ = + (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); + finished_buf_ = 
(bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); + h_finished_buf_ = new bool[batchxbeam]; + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); + + key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); + value_cache_ = key_cache_ + self_cache_size; + if (beam_width > 1) { + cache_indirections_[0] = + (int*)(allocator_->reMalloc(cache_indirections_[0], sizeof(int) * batchxbeam * max_seq_len * 2, true)); + cache_indirections_[1] = cache_indirections_[0] + batchxbeam * max_seq_len; + } + + // prompt_learning weight batch ptrs + prompt_learning_weight_batch_ = + (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); + tiled_prompt_lengths_buf_ = + (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); + + tiled_input_ids_buf_ = + (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); + tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batchxbeam, true)); + tiled_total_padding_count_ = + (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), false); + + transposed_output_ids_buf_ = + (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); + output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); + parent_ids_buf_ = (int*)(allocator_->reMalloc(parent_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); + seq_limit_len_ = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false)); + masked_tokens_ = (bool*)(allocator_->reMalloc(masked_tokens_, sizeof(bool) * batchxbeam * max_cache_seq_len, true)); + + start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, false)); + end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false)); + + context_decoder_input_buf_ = (T*)(allocator_->reMalloc( + context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); + context_decoder_output_buf_ = (T*)(allocator_->reMalloc( + context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); + output_log_probs_buf_ = + (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false)); + + generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + + is_allocate_buffer_ = true; +} + +template +void GptNeoX::freeBuffer() +{ + if (is_allocate_buffer_) { + if (vocab_size_ != vocab_size_padded_) { + padded_embedding_kernel_ptr_ = nullptr; + allocator_->free((void**)(&padded_embedding_kernel_)); + allocator_->free((void**)(&padded_embedding_bias_)); + } + + allocator_->free((void**)(&input_attention_mask_)); + allocator_->free((void**)(&decoder_input_buf_)); + allocator_->free((void**)(&decoder_output_buf_)); + allocator_->free((void**)(&normed_decoder_output_buf_)); + allocator_->free((void**)(&logits_buf_)); + allocator_->free((void**)(&nccl_logits_buf_)); + allocator_->free((void**)(&cum_log_probs_)); + allocator_->free((void**)(&finished_buf_)); + delete[] h_finished_buf_; + allocator_->free((void**)(&sequence_lengths_)); + + allocator_->free((void**)(&key_cache_)); + if (cache_indirections_[0] != nullptr) { + allocator_->free((void**)(&cache_indirections_)[0]); + } + + 
allocator_->free((void**)(&prompt_learning_weight_batch_)); + allocator_->free((void**)(&tiled_prompt_lengths_buf_)); + + allocator_->free((void**)(&tiled_input_ids_buf_)); + allocator_->free((void**)(&tiled_input_lengths_buf_)); + allocator_->free((void**)(&tiled_total_padding_count_)); + + allocator_->free((void**)(&transposed_output_ids_buf_)); + allocator_->free((void**)(&output_ids_buf_)); + allocator_->free((void**)(&parent_ids_buf_)); + allocator_->free((void**)(&seq_limit_len_)); + allocator_->free((void**)(&masked_tokens_)); + + allocator_->free((void**)(&start_ids_buf_)); + allocator_->free((void**)(&end_ids_buf_)); + + allocator_->free((void**)(&context_decoder_input_buf_)); + allocator_->free((void**)(&context_decoder_output_buf_)); + allocator_->free((void**)(&output_log_probs_buf_)); + + allocator_->free((void**)(&generation_should_stop_), true); + + is_allocate_buffer_ = false; + } +} + +template +GptNeoX::GptNeoX(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + start_id_(start_id), + end_id_(end_id), + prompt_learning_start_id_(prompt_learning_start_id), + prompt_learning_type_(prompt_learning_type), + use_gptj_residual_(use_gptj_residual), + hidden_units_(head_num * size_per_head), + local_head_num_(head_num / 1), + attention_type_(attention_type) +{ + tensor_para_.world_size_ = 1; + tensor_para_.rank_ = 0; + pipeline_para_.world_size_ = 1; + pipeline_para_.rank_ = 0; + + int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); + if (std::is_same::value) { + local_vacab_size = ceil(local_vacab_size / 8.f) * 8; + } + vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; + initialize(); +} + +template +GptNeoX::GptNeoX(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), + 
head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + start_id_(start_id), + end_id_(end_id), + prompt_learning_start_id_(prompt_learning_start_id), + prompt_learning_type_(prompt_learning_type), + use_gptj_residual_(use_gptj_residual), + hidden_units_(head_num * size_per_head), + tensor_para_(tensor_para), + pipeline_para_(pipeline_para), + local_head_num_(head_num / tensor_para.world_size_), + custom_all_reduce_comm_(custom_all_reduce_comm), + enable_custom_all_reduce_(enable_custom_all_reduce), + attention_type_(attention_type) +{ + int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); + if (std::is_same::value) { + local_vacab_size = ceil(local_vacab_size / 8.f) * 8; + } + vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; + initialize(); +} + +template +GptNeoX::GptNeoX(GptNeoX const& gpt): + BaseLayer(gpt), + head_num_(gpt.head_num_), + size_per_head_(gpt.size_per_head_), + inter_size_(gpt.inter_size_), + num_layer_(gpt.num_layer_), + vocab_size_(gpt.vocab_size_), + rotary_embedding_dim_(gpt.rotary_embedding_dim_), + start_id_(gpt.start_id_), + end_id_(gpt.end_id_), + prompt_learning_start_id_(gpt.prompt_learning_start_id_), + prompt_learning_type_(gpt.prompt_learning_type_), + use_gptj_residual_(gpt.use_gptj_residual_), + hidden_units_(gpt.hidden_units_), + tensor_para_(gpt.tensor_para_), + pipeline_para_(gpt.pipeline_para_), + local_head_num_(gpt.local_head_num_), + vocab_size_padded_(gpt.vocab_size_padded_), + custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), + enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), + attention_type_(gpt.attention_type_) +{ + initialize(); +} + +template +GptNeoX::~GptNeoX() +{ + delete gpt_decoder_; + delete dynamic_decode_layer_; + delete gpt_context_decoder_; + freeBuffer(); +} + +template +void GptNeoX::registerCallback(callback_sig* fn, void* ctx) +{ + token_generated_cb_ = fn; + token_generated_ctx_ = ctx; +} + +template +void GptNeoX::unRegisterCallback() +{ + token_generated_cb_ = nullptr; + token_generated_ctx_ = nullptr; +} + +template +void GptNeoX::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const GptNeoXWeight* gpt_weights) +{ + FT_CHECK(false); +} + +template +void GptNeoX::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const GptNeoXWeight* gpt_weights) +{ + // input_tensors: + // input_ids [batch_size, max_input_length] + // input_lengths [batch_size] + // prompt_learning_task_name_ids [batch_size] on cpu, optional + // output_seq_len [batch_size] on cpu + // start_id [batch_size] on cpu, optional + // end_id [batch_size] on cpu, optional + // stop_words_list [batch_size, 2, stop_words_length], optional + // bad_words_list [2, bad_words_length] or [batch_size, 2, bad_words_length], optional + // runtime_top_k [1] or [batch_size] on cpu, optional, uint. + // runtime_top_p [1] or [batch_size] on cpu, optional, float. + // beam_search_diversity_rate [1] or [batch_size] on cpu, optional, float. + // temperature [1] or [batch_size] on cpu, optional, float. + // len_penalty [1] or [batch_size] on cpu, optional, float. + // repetition_penalty [1] or [batch_size] on cpu, optional, float. + // min_length [1] or [batch_size] on cpu, optional, int + // random_seed [1] or [batch_size] on cpu, optional, unsigned long long int. 
+ // request_prompt_lengths [batch_size], optional + // request_prompt_embedding [batch_size, max_prompt_length, hidden_units], float, optional + // requst_prompt_type [batch_size], int, optional + // top_p_decay [batch_size] on gpu, float, optional + // top_p_min [batch_size] on gpu, float, optional + // top_p_reset_ids [batch_size] on gpu, uint32, optional + + // output_tensors: + // output_ids [batch_size, beam_width, max_output_seq_len] + // sequence_length [batch_size, beam_width] + // output_log_probs [batch_size, beam_width, request_output_seq_len], must be float*. + // optional. It leads to additional computing cost. If we don't need this result, don't put it. + // cum_log_probs [batch_size, beam], optional, must be float*. + // optional. It leads to additional computing cost. If we don't need this result, don't put it. + + // Step is from max_input_length ~ max_output_seq_len, + // When step = k, we put output ids and caches at step k, and the sequence_length would be k - 1 before + // complete this step. + // When there is no input_ids, put the start token at step 0 of output_ids_buf_. After forward, only copy + // the step 1 ~ max_output_seq_len of output_ids_buf_ to output_tensors->at(0).data + + FT_CHECK_WITH_INFO(input_tensors->size() >= 3, "input_tensors->size() >= 3"); + FT_CHECK_WITH_INFO(output_tensors->size() >= 2, "output_tensors->size() >= 2"); + FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); + FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); + FT_CHECK(input_tensors->find("output_seq_len") != input_tensors->end() + && input_tensors->at("output_seq_len").shape.size() == 1); + FT_CHECK(output_tensors->at("output_ids").shape.size() == 3); + FT_CHECK(output_tensors->at("sequence_length").shape.size() == 2); + FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape[0] == output_tensors->at("output_ids").shape[0], + "input_tensors->at(\"input_ids\").shape[0] == output_tensors->at(\"output_ids\").shape[0]"); + + const size_t batch_size = output_tensors->at("output_ids").shape[0]; + const size_t beam_width = output_tensors->at("output_ids").shape[1]; + + PromptLearningType request_prompt_type = PromptLearningType::no_prompt; + int valid_prompt_inputs = input_tensors->count("request_prompt_type") + + input_tensors->count("request_prompt_lengths") + + input_tensors->count("request_prompt_embedding"); + + if (valid_prompt_inputs == 3) { + request_prompt_type = static_cast(input_tensors->at("request_prompt_type").getVal()); + FT_LOG_INFO("Apply prompt embedding from input, will ignore task name ids"); + } + else if (valid_prompt_inputs > 0) { + FT_LOG_WARNING( + "Prompts not applied: request_prompt_embedding, request_prompt_lengths, request_prompt_type are all needed!"); + } + if (request_prompt_type == PromptLearningType::prefix_prompt) { + FT_LOG_WARNING("Request prompt doesn't support prefix prompt currently!"); + } + + // Prefix Prompt Inputs + // Padding works as follows: p p x x i i i x x --> p p i i i x x x x (p denotes prompt, i denotes input, x denotes + // pad) + // TODO (perkzz): move unnecessary paddings + const int* prompt_learning_task_name_ids = + input_tensors->count("prompt_learning_task_name_ids") ? 
+ input_tensors->at("prompt_learning_task_name_ids").getPtr() : + nullptr; + has_prefix_prompt_ = + (prompt_learning_task_name_ids != nullptr) && (prompt_learning_type_ == PromptLearningType::prefix_prompt); + int max_prefix_prompt_length = 0; + + FT_CHECK_WITH_INFO( + !(prompt_learning_task_name_ids != nullptr + && (prompt_learning_type_ == PromptLearningType::no_prompt + || prompt_learning_type_ == PromptLearningType::soft_prompt)), + "prompt_learning_type is prefix_prompt either p_prompt_tuning when prompt_learning_task_name_ids are provided."); + + // NOTE: Prefix Prompt PreProcessing + // get prefix_prompt_weight for each batch --> shape [batch, beam_width] + // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] + std::vector prefix_prompt_weight_batch_ptrs; + std::vector prefix_prompt_lengths; + if (has_prefix_prompt_) { + for (int bs_id = 0; bs_id < batch_size; ++bs_id) { + int task_id = prompt_learning_task_name_ids[bs_id]; + // throw errors when prompt task_name_ids are not found + std::pair prefix_prompt_weight_length_pair; + try { + prefix_prompt_weight_length_pair = gpt_weights->prompt_learning_table.at(task_id); + } + catch (const std::out_of_range& oor) { + FT_LOG_ERROR("prefix_prompt_weights_lengths not found for prompt task id: " + task_id); + throw oor; + } + for (int bw_id = 0; bw_id < beam_width; ++bw_id) { + prefix_prompt_weight_batch_ptrs.push_back(prefix_prompt_weight_length_pair.first); + prefix_prompt_lengths.push_back(prefix_prompt_weight_length_pair.second); + } + } + + max_prefix_prompt_length = *max_element(prefix_prompt_lengths.begin(), prefix_prompt_lengths.end()); + + FT_LOG_DEBUG("max_prefix_prompt_length: %d", max_prefix_prompt_length); + + if (max_prefix_prompt_length == 0) { + has_prefix_prompt_ = false; + FT_LOG_DEBUG("prompts are not applied !"); + } + } + + int max_input_length = input_tensors->at("input_ids").shape[1]; + FT_CHECK_WITH_INFO(!(max_input_length == 0 && max_prefix_prompt_length > 0), + "Prefix Prompt should come with inputs!"); + + // Prefix Soft Prompt + has_prefix_soft_prompt_ = request_prompt_type == PromptLearningType::soft_prompt; + const size_t max_prefix_soft_prompt_length = + has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; + const size_t limit_len_offset = max_prefix_soft_prompt_length + (max_input_length == 0 ? 1 : 0); + const size_t max_output_seq_len = input_tensors->at("output_seq_len").max() + limit_len_offset; + const size_t max_seq_len = max_output_seq_len; + // max cache seq len should include max prefix prompt length as it has k/v states + const size_t max_cache_seq_len = max_output_seq_len + max_prefix_prompt_length; + if (max_cache_seq_len < max_seq_len) { + FT_LOG_WARNING("max_cache_seq_len (%d) is less than max_seq_len (%d). " + "Note that this reduces the memory cost of k/v cache, but may hurt the accuracy.", + max_cache_seq_len, + max_seq_len); + } + else if (max_cache_seq_len > max_seq_len) { + FT_LOG_WARNING("max_cache_seq_len (%d) is larger than max_seq_len (%d). " + "This may lead to additional memory cost. 
Suggest to use smaller max_cache_seq_len.", + max_cache_seq_len, + max_seq_len); + } + const cudaDataType_t gemm_data_type = getCudaDataType(); + allocateBuffer( + batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length + max_prefix_soft_prompt_length); + setSeqLimitLen(seq_limit_len_, input_tensors->at("output_seq_len"), limit_len_offset, batch_size); + + sync_check_cuda_error(); + { + TensorMap input_map(*input_tensors); + dynamic_decode_layer_->setup(batch_size, beam_width, &input_map); + handleOptArg(&input_map, "start_id", start_ids_buf_, start_id_, batch_size); + handleOptArg(&input_map, "end_id", end_ids_buf_, end_id_, batch_size); + } + + const DataType data_type = getTensorType(); + + const std::vector self_k_cache_shape = {num_layer_ / pipeline_para_.world_size_, + batch_size * beam_width, + local_head_num_, + size_per_head_ / (16 / sizeof(T)), + max_cache_seq_len, + 16 / sizeof(T)}; + const std::vector self_v_cache_shape = {num_layer_ / pipeline_para_.world_size_, + batch_size * beam_width, + local_head_num_, + max_cache_seq_len, + size_per_head_}; + + // initialize the output ids and parent ids + cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); + cudaMemsetAsync(parent_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); + cudaMemsetAsync(masked_tokens_, false, sizeof(bool) * batch_size * beam_width * max_cache_seq_len, stream_); + cudaMemsetAsync(tiled_total_padding_count_, 0, sizeof(int) * batch_size * beam_width, stream_); + if (beam_width > 1) { + cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); + } + + // Prefix prompts + if (has_prefix_prompt_) { + cudaMemcpyAsync(prompt_learning_weight_batch_, + prefix_prompt_weight_batch_ptrs.data(), + sizeof(T*) * batch_size * beam_width, + cudaMemcpyDefault, + stream_); + cudaMemcpyAsync(tiled_prompt_lengths_buf_, + prefix_prompt_lengths.data(), + sizeof(int) * batch_size * beam_width, + cudaMemcpyDefault, + stream_); + } + + sync_check_cuda_error(); + + // handle first step + if (has_prefix_prompt_ || has_prefix_soft_prompt_ || max_input_length > 1) { + invokeTileGptInputs(tiled_input_ids_buf_, + tiled_input_lengths_buf_, + input_tensors->at("input_ids").getPtr(), + input_tensors->at("input_lengths").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + sync_check_cuda_error(); + + if (has_prefix_soft_prompt_) { + inputIdsEmbeddingLookupPosEncodingSoftPromptParam param; + param.from_tensor = context_decoder_input_buf_; + param.output_ids = output_ids_buf_; + param.input_lengths = tiled_input_lengths_buf_; + param.embedding_table = gpt_weights->pre_decoder_embedding_table; + param.pos_table = gpt_weights->position_encoding_table; + param.prefix_soft_prompt_embedding = input_tensors->at("request_prompt_embedding").getPtr(); + param.prefix_soft_prompt_lengths = input_tensors->at("request_prompt_lengths").getPtr(); + param.input_ids = tiled_input_ids_buf_; + param.start_step = 1; + param.max_input_length = max_input_length; + param.max_prefix_soft_prompt_length = max_prefix_soft_prompt_length; + param.batch_size = batch_size; + param.beam_width = beam_width; + param.hidden_units = hidden_units_; + param.stream = stream_; + + invokeInputIdsEmbeddingLookupPosEncodingSoftPrompt(param); + sync_check_cuda_error(); + max_input_length += max_prefix_soft_prompt_length; // view soft_prompt as input + } + else { + invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, 
+ output_ids_buf_, + gpt_weights->pre_decoder_embedding_table, + gpt_weights->position_encoding_table, + pPromptTuningParam{}, // no p/prompt tuning + tiled_input_ids_buf_, + 1, + max_input_length, + max_input_length, + batch_size * beam_width, + hidden_units_, + stream_); + sync_check_cuda_error(); + } + + invokeBuildDecoderAttentionMask(input_attention_mask_, + tiled_input_lengths_buf_, + tiled_prompt_lengths_buf_, + batch_size * beam_width, + max_input_length, + max_prefix_prompt_length, + stream_); + sync_check_cuda_error(); + + std::unordered_map decoder_input_tensors{ + {"decoder_input", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width, (size_t)max_input_length, hidden_units_}, + context_decoder_input_buf_}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width, + 1, + (size_t)max_input_length, + (size_t)(max_input_length + max_prefix_prompt_length)}, + input_attention_mask_}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_}}, + {"d_prefix_prompt_batch", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width}, + has_prefix_prompt_ ? prompt_learning_weight_batch_ : nullptr}}, + {"d_prefix_prompt_lengths", + Tensor{MEMORY_GPU, + TYPE_INT32, + {batch_size * beam_width}, + has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : nullptr}}}; + + std::unordered_map decoder_output_tensors{ + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width, (size_t)max_input_length, hidden_units_}, + context_decoder_output_buf_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}, + {"last_token_hidden_units", + Tensor{MEMORY_GPU, data_type, {batch_size * beam_width, hidden_units_}, decoder_output_buf_}}}; + + gpt_context_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); + sync_check_cuda_error(); + invokeDecodingInitialize(finished_buf_, + sequence_lengths_, + nullptr, + cum_log_probs_, + start_ids_buf_, + batch_size, + beam_width, + max_input_length - 1, + stream_); + sync_check_cuda_error(); + } + else if (max_input_length == 0) { + FT_CHECK(prompt_learning_type_ == PromptLearningType::no_prompt + && request_prompt_type == PromptLearningType::no_prompt); // Not support prompts in this case + max_input_length++; + invokeDecodingInitialize(finished_buf_, + sequence_lengths_, + output_ids_buf_, + cum_log_probs_, + start_ids_buf_, + batch_size, + beam_width, + max_input_length - 1, + stream_); + std::vector h_input_lengths(batch_size * beam_width, 1); + cudaMemcpyAsync(tiled_input_lengths_buf_, + h_input_lengths.data(), + sizeof(int) * batch_size * beam_width, + cudaMemcpyHostToDevice, + stream_); + sync_check_cuda_error(); + } + else if (max_input_length == 1) { + FT_CHECK(prompt_learning_type_ == PromptLearningType::no_prompt + && request_prompt_type == PromptLearningType::no_prompt); // Not support prompts in this case + invokeDecodingInitialize(finished_buf_, + sequence_lengths_, + nullptr, + cum_log_probs_, + start_ids_buf_, + batch_size, + beam_width, + max_input_length - 1, + stream_); + sync_check_cuda_error(); + invokeTileGptInputs(tiled_input_ids_buf_, + tiled_input_lengths_buf_, + input_tensors->at("input_ids").getPtr(), + input_tensors->at("input_lengths").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + sync_check_cuda_error(); + + cudaMemcpyAsync(output_ids_buf_, + 
tiled_input_ids_buf_, + sizeof(int) * batch_size * beam_width, + cudaMemcpyDeviceToDevice, + stream_); + } + + if (vocab_size_ == vocab_size_padded_) { + padded_embedding_kernel_ptr_ = gpt_weights->post_decoder_embedding.kernel; + } + else { + cudaMemcpyAsync(padded_embedding_kernel_, + gpt_weights->post_decoder_embedding.kernel, + sizeof(T) * vocab_size_ * hidden_units_, + cudaMemcpyDeviceToDevice, + stream_); + cudaMemcpyAsync(padded_embedding_bias_, + gpt_weights->post_decoder_embedding.bias, + sizeof(T) * vocab_size_, + cudaMemcpyDeviceToDevice, + stream_); + sync_check_cuda_error(); + } + + invokeMaskPaddingTokens(masked_tokens_, + input_tensors->at("input_lengths").getPtr(), // not_tiled + tiled_prompt_lengths_buf_, + max_cache_seq_len, + max_input_length + max_prefix_prompt_length, + 0, + batch_size, + beam_width, + stream_); + + for (int step = max_input_length; step < (int)max_output_seq_len; step++) { + const int src_indir_idx = (step - max_input_length) % 2; + const int tgt_indir_idx = 1 - src_indir_idx; + + const size_t local_batch_size = getLocalBatchSize(batch_size, 1, pipeline_para_.world_size_); + FT_CHECK(batch_size % local_batch_size == 0); + const size_t iteration_num = batch_size / local_batch_size; + *generation_should_stop_ = true; + + for (uint ite = 0; ite < iteration_num; ++ite) { + const int id_offset = ite * local_batch_size * beam_width; + const int hidden_units_offset = id_offset * hidden_units_; + const int vocab_size_units_offset = id_offset * vocab_size_padded_; + + if (!(max_input_length > 1 && step == max_input_length)) { + if (pipeline_para_.rank_ == 0) { + invokeEmbeddingLookupPosEncodingPadCount(decoder_input_buf_ + hidden_units_offset, + gpt_weights->pre_decoder_embedding_table, + gpt_weights->position_encoding_table, + output_ids_buf_ + id_offset, + tiled_total_padding_count_ + id_offset, + local_batch_size * beam_width, + hidden_units_, + (T)(1.0f), + step - 1, + batch_size * beam_width, + 0, + stream_); + sync_check_cuda_error(); + } + std::unordered_map decoder_input_tensors{ + {"decoder_input", + Tensor{MEMORY_GPU, + data_type, + {local_batch_size * beam_width, hidden_units_}, + decoder_input_buf_ + hidden_units_offset}}, + {"finished", + Tensor{MEMORY_GPU, TYPE_BOOL, {local_batch_size * beam_width}, finished_buf_ + id_offset}}, + {"sequence_lengths", + Tensor{MEMORY_GPU, TYPE_INT32, {local_batch_size * beam_width}, sequence_lengths_ + id_offset}}, + {"total_padding_tokens", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size * beam_width}, + tiled_total_padding_count_ + id_offset}}, + {"d_prefix_prompt_lengths", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size}, + has_prefix_prompt_ ? (tiled_prompt_lengths_buf_ + id_offset) : nullptr}}, + {"max_prefix_prompt_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_prefix_prompt_length}}, + {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, + {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, + {"ite", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &ite}}, + {"cache_indirection", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size, beam_width, max_output_seq_len}, + beam_width > 1 ? 
cache_indirections_[src_indir_idx] + id_offset * max_output_seq_len : + nullptr}}, + {"masked_tokens", + Tensor{MEMORY_GPU, + TYPE_BOOL, + {local_batch_size * beam_width, max_cache_seq_len}, + masked_tokens_ + id_offset * max_cache_seq_len}}}; + std::unordered_map decoder_output_tensors{ + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {local_batch_size * beam_width, hidden_units_}, + decoder_output_buf_ + hidden_units_offset}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}}; + gpt_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); + } + + if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { + invokeGeneralLayerNorm(normed_decoder_output_buf_ + hidden_units_offset, + decoder_output_buf_ + hidden_units_offset, + gpt_weights->post_decoder_layernorm.gamma, + gpt_weights->post_decoder_layernorm.beta, + layernorm_eps_, + local_batch_size * beam_width, + hidden_units_, + (float*)nullptr, + 0, + stream_); + sync_check_cuda_error(); + + if (tensor_para_.world_size_ == 1) { + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + vocab_size_padded_, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + logits_buf_ + vocab_size_units_offset, + CUDA_R_32F, + vocab_size_padded_, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); + } + else { + FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); + const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + local_vocab_size, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_ + + tensor_para_.rank_ * local_vocab_size * hidden_units_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + nccl_logits_buf_ + vocab_size_units_offset + + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, + CUDA_R_32F, + local_vocab_size, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); + ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + local_batch_size * beam_width * local_vocab_size, + tensor_para_.rank_, + tensor_para_, + stream_); + invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + tensor_para_.world_size_, + local_batch_size * beam_width, + local_vocab_size, + stream_); + } + + int tmp_local_batch_size = local_batch_size; + bool is_initialize_random_table = step == max_input_length; + std::unordered_map dynamic_decode_input_tensors{ + {"logits", + Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size_padded_}, logits_buf_}}, + // {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size_padded_}, nullptr}}, + {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, + {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, + {"input_lengths", + Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, tiled_input_lengths_buf_}}, + {"sequence_limit_length", Tensor{MEMORY_GPU, TYPE_UINT32, {batch_size}, seq_limit_len_}}, + {"ite", 
Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}}, + {"src_cache_indirection", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size, beam_width, max_output_seq_len}, + cache_indirections_[src_indir_idx] + id_offset * max_output_seq_len}}, + {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &tmp_local_batch_size}}, + {"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, end_ids_buf_}}, + {"is_initialize_random_table", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &is_initialize_random_table}}}; + + for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) { + if (dynamic_decode_input_tensors.find(t->first) == dynamic_decode_input_tensors.end()) { + dynamic_decode_input_tensors.insert(*t); + } + } + + // common outputs + bool subbatch_should_stop = false; + std::unordered_map dynamic_decode_output_tensors{ + {"output_ids", + Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, output_ids_buf_}}, + {"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, finished_buf_}}, + // cum_log_probs is necessary for beam search, while it is optional for sampling. + {"cum_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + {batch_size * beam_width}, + ((beam_width > 1) || (output_tensors->count("cum_log_probs") > 0)) ? cum_log_probs_ : + nullptr}}, + {"output_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + {max_seq_len, batch_size, beam_width}, + output_tensors->count("output_log_probs") > 0 + && output_tensors->at("output_log_probs").data != nullptr ? + output_log_probs_buf_ : + nullptr}}, + {"parent_ids", + Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, parent_ids_buf_}}, + {"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, sequence_lengths_}}, + {"tgt_cache_indirection", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size, beam_width, max_output_seq_len}, + cache_indirections_[tgt_indir_idx] + id_offset * max_output_seq_len}}, + {"should_stop", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &subbatch_should_stop}}}; + + for (auto t = output_tensors->begin(); t != output_tensors->end(); ++t) { + // Handle exceptions. 
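+                    // Note: "cum_log_probs" and "output_log_probs" are already mapped to the
+                    // internal buffers (cum_log_probs_ / output_log_probs_buf_) above; the
+                    // user-provided tensors are filled from those buffers later in
+                    // setOutputTensors(), so they are skipped here.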
+ if (t->first == "cum_log_probs" || t->first == "output_log_probs") { + continue; + } + dynamic_decode_output_tensors.insert(*t); + } + + dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors); + *generation_should_stop_ &= subbatch_should_stop; + } + } + + if (pipeline_para_.world_size_ > 1) { + ftNcclGroupStart(); + ftNcclBroadCast(output_ids_buf_ + step * batch_size * beam_width, + batch_size * beam_width, + pipeline_para_.world_size_ - 1, + pipeline_para_, + stream_); + + ftNcclBroadCast( + sequence_lengths_, batch_size * beam_width, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); + + ftNcclBroadCast(generation_should_stop_, 1, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); + + if (beam_width > 1) { + ftNcclBroadCast(cache_indirections_[tgt_indir_idx], + batch_size * beam_width * max_output_seq_len, + pipeline_para_.world_size_ - 1, + pipeline_para_, + stream_); + } + ftNcclGroupEnd(); + // throw errors when detected + ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); + sync_check_cuda_error(); + } + + if (*generation_should_stop_) { + break; + } + if (token_generated_cb_ && step + 1 < (int)max_output_seq_len) { + setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); + sendTensorsToFirstPipelineNode(output_tensors, input_tensors); + + if (pipeline_para_.rank_ == 0 && tensor_para_.rank_ == 0) { + token_generated_cb_(output_tensors, token_generated_ctx_); + } + } + if (step == max_input_length) { + /* We have just finished processing input: update the padding count: + * total_padding_count += (max_input_length - input_lengths) + * if has prefix prompts, += (max_prefix_prompt_length - prompt_length) + */ + invokeUpdatePaddingCount(tiled_total_padding_count_, + input_tensors->at("input_lengths").getPtr(), // not_tiled + has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : (const int*)nullptr, + max_input_length, + has_prefix_prompt_ ? 
max_prefix_prompt_length : 0, + batch_size, + beam_width, + stream_); + } + } + + setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); + sendTensorsToFirstPipelineNode(output_tensors, input_tensors); +} + +template +void GptNeoX::sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + if (pipeline_para_.world_size_ == 1) { + // throw errors when detected + ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); + return; + } + + const auto pp_rank = pipeline_para_.rank_; + + ftNcclGroupStart(); + for (auto const& it : *output_tensors) { + if (it.second.data == nullptr) { + continue; + } + + if (pp_rank == pipeline_para_.world_size_ - 1) { + ftNcclSend(it.second.getPtr(), it.second.sizeBytes(), 0, pipeline_para_, stream_); + } + else if (pp_rank == 0) { + ftNcclRecv(it.second.getPtr(), + it.second.sizeBytes(), + pipeline_para_.world_size_ - 1, + pipeline_para_, + stream_); + } + } + ftNcclGroupEnd(); + // throw errors when detected + ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); +} + +template +void GptNeoX::setOutputTensors(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const size_t max_input_length, + const size_t max_output_seq_len) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + if (pipeline_para_.rank_ != pipeline_para_.world_size_ - 1) { + return; + } + + const size_t batch_size = output_tensors->at("output_ids").shape[0]; + const size_t beam_width = output_tensors->at("output_ids").shape[1]; + uint* sequence_lengths = output_tensors->at("sequence_length").getPtr(); + const size_t max_prefix_soft_prompt_length = + has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; + + if (input_tensors->at("input_ids").shape[1] == 0) { + invokeCudaD2DcpyConvert( + sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); + // TODO: D2D sequence_lenghts + if (beam_width > 1) { + // For beam search, do gather_tree + // take output_parent_ids as inter buffer + invokeGatherTree(transposed_output_ids_buf_, + sequence_lengths_, + max_output_seq_len, + batch_size, + beam_width, + output_ids_buf_ + batch_size * beam_width, + parent_ids_buf_ + batch_size * beam_width, + end_ids_buf_, + stream_); + + // transpose and take output_parent_ids as inter buffer + invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), + transposed_output_ids_buf_, + max_output_seq_len - 1, + batch_size * beam_width, + 1, + stream_); + } + else { + // For sampling, only copy the results to output_tensor + invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), + output_ids_buf_ + batch_size * beam_width, + max_output_seq_len - 1, + batch_size * beam_width, + 1, + stream_); + } + } + else { + + // For sampling, it is equivalent to all parent ids are 0. + gatherTreeParam param; + param.beams = transposed_output_ids_buf_; + param.max_sequence_lengths = sequence_lengths_; + // add sequence_length 1 here because the sequence_length of time step t is t - 1 + param.max_sequence_length_final_step = 1; + param.max_time = max_output_seq_len; + param.batch_size = batch_size; + param.beam_width = beam_width; + param.step_ids = output_ids_buf_; + param.parent_ids = beam_width == 1 ? nullptr : parent_ids_buf_; + param.end_tokens = end_ids_buf_; + param.max_input_length = max_input_length; + param.prefix_soft_prompt_lengths = + has_prefix_soft_prompt_ ? 
input_tensors->at("request_prompt_lengths").getPtr() : nullptr; + param.input_lengths = tiled_input_lengths_buf_; + param.max_prefix_soft_prompt_length = max_prefix_soft_prompt_length; + param.max_input_without_prompt_length = max_input_length; + param.stream = stream_; + param.output_ids = output_tensors->at("output_ids").getPtr(); + invokeGatherTree(param); + invokeCudaD2DcpyConvert( + sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); + sync_check_cuda_error(); + } + if ((output_tensors->count("output_log_probs") > 0 && output_tensors->at("output_log_probs").data != nullptr)) { + invokeTransposeAxis01(output_tensors->at("output_log_probs").getPtr(), + output_log_probs_buf_, + input_tensors->at("output_seq_len").max() - max_input_length, + batch_size * beam_width, + 1, + stream_); + } + // Return the cumulative log probability if requested. + if (output_tensors->count("cum_log_probs") > 0) { + Tensor cum_log_probs = output_tensors->at("cum_log_probs"); + FT_CHECK_WITH_INFO(cum_log_probs.size() == batch_size * beam_width, + "The shape of cum_log_probs does not match with batch_size x beam_width."); + cudaAutoCpy(cum_log_probs.getPtr(), cum_log_probs_, cum_log_probs.size(), stream_); + } +} + +template +size_t GptNeoX::getPipelineParallelRank() +{ + return pipeline_para_.rank_; +} + +template +size_t GptNeoX::getPipelineParallelSize() +{ + return pipeline_para_.world_size_; +} + +template +size_t GptNeoX::getTensorParallelRank() +{ + return tensor_para_.rank_; +} + +template +size_t GptNeoX::getTensorParallelSize() +{ + return tensor_para_.world_size_; +} + +template +bool* GptNeoX::getFinishBuffer() +{ + return finished_buf_; +} + +template class GptNeoX; +template class GptNeoX; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h new file mode 100644 index 000000000..9749a2070 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include "src/fastertransformer/layers/DynamicDecodeLayer.h" +#include "src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h" +#include "src/fastertransformer/models/gptneox/GptNeoXDecoder.h" +#include "src/fastertransformer/models/gptneox/GptNeoXWeight.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/prompt_learning.h" + +namespace fastertransformer { + +template +class GptNeoX: public BaseLayer { +private: + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t vocab_size_; + size_t rotary_embedding_dim_; + + static constexpr bool neox_rotary_style_ = true; + static constexpr float layernorm_eps_ = 1e-5f; + + int start_id_; + int end_id_; + size_t hidden_units_; + + size_t local_head_num_; + NcclParam tensor_para_; + NcclParam pipeline_para_; + + std::shared_ptr custom_all_reduce_comm_; + int enable_custom_all_reduce_; + + AttentionType attention_type_; + + size_t vocab_size_padded_; + const bool is_context_qk_buf_float_ = + (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || + std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); + + // Residual Type + const bool use_gptj_residual_ = true; + + // Prompt Learning Parameters + PromptLearningType prompt_learning_type_; + int prompt_learning_start_id_; // start_id for prompt_learning (only needed by prefix prompts) + bool has_prefix_prompt_; + bool has_prefix_soft_prompt_; + + GptNeoXDecoder* gpt_decoder_; + GptNeoXContextDecoder* gpt_context_decoder_; + DynamicDecodeLayer* dynamic_decode_layer_; + + void allocateBuffer() override; + void allocateBuffer( + size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); + void freeBuffer() override; + + void initialize(); + +protected: + T* padded_embedding_kernel_; + T* padded_embedding_bias_; + const T* padded_embedding_kernel_ptr_; + + T* input_attention_mask_; + + T* decoder_input_buf_; + T* decoder_output_buf_; + T* normed_decoder_output_buf_; + + float* logits_buf_; + float* nccl_logits_buf_; + float* cum_log_probs_; + + bool* finished_buf_; + bool* h_finished_buf_; + int* sequence_lengths_ = nullptr; + int* tiled_total_padding_count_ = nullptr; + uint32_t* seq_limit_len_ = nullptr; + + T* key_cache_; + T* value_cache_; + int* cache_indirections_[2] = {nullptr, nullptr}; + + // prompt_learning weight_batch ptrs + const T** prompt_learning_weight_batch_; + int* tiled_prompt_lengths_buf_; // only needed by prefix prompts + + int* tiled_input_ids_buf_; + int* tiled_input_lengths_buf_; + int* transposed_output_ids_buf_; + int* output_ids_buf_; + int* parent_ids_buf_; + int* start_ids_buf_; + int* end_ids_buf_; + bool* masked_tokens_ = nullptr; + + bool* generation_should_stop_ = nullptr; + + T* context_decoder_input_buf_; + T* context_decoder_output_buf_; + float* output_log_probs_buf_; + + // function pointer callback + using callback_sig = void(std::unordered_map*, void*); + callback_sig* token_generated_cb_ = nullptr; + void* token_generated_ctx_ = nullptr; + + void setOutputTensors(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const size_t max_input_length, + const size_t max_seq_len); + void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors); + +public: + GptNeoX(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, 
+ size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce = 0); + + GptNeoX(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce = 0); + + GptNeoX(GptNeoX const& GptNeoX); + + ~GptNeoX(); + + void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const GptNeoXWeight* gpt_weights); + + void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const GptNeoXWeight* gpt_weights); + + size_t getPipelineParallelRank(); + size_t getPipelineParallelSize(); + size_t getTensorParallelRank(); + size_t getTensorParallelSize(); + bool* getFinishBuffer(); + + void registerCallback(callback_sig* fn, void* ctx); + void unRegisterCallback(); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc new file mode 100644 index 000000000..f23d1a977 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h" +#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" +#include "src/fastertransformer/kernels/gpt_kernels.h" + +#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.h" + +namespace fastertransformer { + +template +void GptNeoXContextDecoder::initialize() +{ + self_attention_layer_ = new TensorParallelGptContextAttentionLayer(0, // max_batch_size + 0, // max_seq_len + head_num_, + size_per_head_, + rotary_embedding_dim_, + neox_rotary_style_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + is_qk_buf_float_, + false, + 0, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size + 0, // max_seq_len + head_num_, + size_per_head_, + 0, // expert_num + inter_size_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + false, + 0, + false, // use_gated_activation = false; + custom_all_reduce_comm_, + enable_custom_all_reduce_); +} + +template +void GptNeoXContextDecoder::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void GptNeoXContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +{ + decoder_normed_input_ = reinterpret_cast( + allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + self_attn_output_ = reinterpret_cast( + allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + ffn_output_ = reinterpret_cast( + allocator_->reMalloc(ffn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + decoder_layer_output_ = reinterpret_cast( + allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); + padding_offset_ = + reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); + cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + is_allocate_buffer_ = true; +} + +template +void GptNeoXContextDecoder::freeBuffer() +{ + if (is_allocate_buffer_ == true) { + allocator_->free((void**)(&decoder_normed_input_)); + allocator_->free((void**)(&self_attn_output_)); + allocator_->free((void**)(&ffn_output_)); + allocator_->free((void**)(&decoder_layer_output_)); + allocator_->free((void**)(&h_pinned_token_num_ptr_), true); + allocator_->free((void**)(&padding_offset_)); + allocator_->free((void**)(&cu_seqlens_)); + is_allocate_buffer_ = false; + } +} + +template +bool GptNeoXContextDecoder::isValidLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) + && (l < local_num_layer * (pipeline_para_.rank_ + 1)); +} + +template +bool GptNeoXContextDecoder::isFirstLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); +} + +template +bool GptNeoXContextDecoder::isLastLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l 
< num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); +} + +template +int GptNeoXContextDecoder::getFirstLayerParallelId() +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return local_num_layer * pipeline_para_.rank_; +} + +template +GptNeoXContextDecoder::GptNeoXContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + rotary_embedding_dim_(rotary_embedding_dim), + neox_rotary_style_(neox_rotary_style), + use_gptj_residual_(use_gptj_residual), + layernorm_eps_(layernorm_eps), + hidden_units_(head_num * size_per_head), + tensor_para_(tensor_para), + pipeline_para_(pipeline_para), + is_qk_buf_float_(is_qk_buf_float), + attention_type_(attention_type), + custom_all_reduce_comm_(custom_all_reduce_comm), + enable_custom_all_reduce_(enable_custom_all_reduce) +{ + initialize(); +} + +template +GptNeoXContextDecoder::GptNeoXContextDecoder(GptNeoXContextDecoder const& decoder): + BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), + head_num_(decoder.head_num_), + size_per_head_(decoder.size_per_head_), + inter_size_(decoder.inter_size_), + num_layer_(decoder.num_layer_), + rotary_embedding_dim_(decoder.rotary_embedding_dim_), + neox_rotary_style_(decoder.neox_rotary_style_), + use_gptj_residual_(decoder.use_gptj_residual_), + layernorm_eps_(decoder.layernorm_eps_), + hidden_units_(decoder.hidden_units_), + tensor_para_(decoder.tensor_para_), + pipeline_para_(decoder.pipeline_para_), + is_qk_buf_float_(decoder.is_qk_buf_float_), + attention_type_(decoder.attention_type_), + custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), + enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) +{ + initialize(); +} + +template +GptNeoXContextDecoder::~GptNeoXContextDecoder() +{ + delete self_attention_layer_; + delete ffn_layer_; + freeBuffer(); +} + +template +void GptNeoXContextDecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, + {"attention_mask", input_tensors->at(1)}, + {"input_lengths", input_tensors->at(2)}}; + std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, + {"key_cache", output_tensors->at(1)}, + {"value_cache", output_tensors->at(2)}, + {"last_token_hidden_units", output_tensors->at(3)}}; + + forward(&output_tensors_map, &input_tensors_map, gpt_decoder_layer_weight); +} + +template +void GptNeoXContextDecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + // input tensors: + // decoder_input [batch_size, seq_len, hidden_dimension], + // attention_mask [batch_size, 1, seq_len, seq_len + max_prompt_length] + // input_lengths [batch_size] + // 
d_prefix_prompt_batch [batch_size], + // each element contains ptr with buffer shape[2, local_head_num_, prompt_length, size_per_head] + // prefix_prompt_lengths [batch size] + + // output tensors: + // decoder_output [batch_size, seq_len, hidden_dimension], + // key_cache [num_layer, batch, local_head_num, size_per_head // x, max_seq_len, x] + // value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] + // last_token_hidden_units [batch_size, hidden_dimension] + + // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * local_batch_size'. + // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during + // computing. + + FT_CHECK(input_tensors->size() == 5); + FT_CHECK(output_tensors->size() == 4); + + const int batch_size = input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; + const int max_prompt_length = + input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; + const DataType data_type = getTensorType(); + allocateBuffer(batch_size, seq_len); + + T* decoder_input = input_tensors->at("decoder_input").getPtr(); + T* decoder_output = output_tensors->at("decoder_output").getPtr(); + const T* attention_mask = input_tensors->at("attention_mask").getPtr(); + const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); + const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + + const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); + FT_CHECK(batch_size % local_batch_size == 0); + const int iteration_num = batch_size / local_batch_size; + + Tensor& k_cache = output_tensors->at("key_cache"); + Tensor& v_cache = output_tensors->at("value_cache"); + std::vector self_k_cache_size; + self_k_cache_size.push_back(local_batch_size); + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + self_k_cache_size.push_back(*t); + } + std::vector self_v_cache_size; + self_v_cache_size.push_back(local_batch_size); + for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { + self_v_cache_size.push_back(*t); + } + + AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? 
+ getUnfusedAttentionType(attention_type_) : + attention_type_; + const bool is_unpadded_mha = isUnPaddedMHA(attention_type); + + for (int ite = 0; ite < iteration_num; ite++) { + size_t h_token_num = local_batch_size * seq_len; + if (is_unpadded_mha) { + const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, + &h_token_num, + padding_offset_, + cu_seqlens_, + base_input_lengths + ite * local_batch_size, + local_batch_size, + seq_len, + stream_); + } + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l) == false) { + continue; + } + + if (l == 0 && is_unpadded_mha) { + invokeRemovePadding(decoder_layer_output_, + decoder_input + ite * local_batch_size * seq_len * hidden_units_, + padding_offset_, + h_token_num, + hidden_units_, + stream_); + } + + const bool is_final = false; // TODO(bhsueh) remove this flag + T* layer_input = decoder_layer_output_; + T* layer_output = decoder_layer_output_; + if (!is_unpadded_mha) { + if (l == 0) { + layer_input = decoder_input; + layer_input += ite * local_batch_size * seq_len * hidden_units_; + } + if (l == num_layer_ - 1) { + layer_output = decoder_output; + layer_output += ite * local_batch_size * seq_len * hidden_units_; + } + } + + if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_ / tensor_para_.world_size_; + ftNcclRecv(layer_input + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ - 1, + pipeline_para_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); + } + } + + invokeGeneralLayerNorm(decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + h_token_num, + hidden_units_, + (float*)nullptr, + 0, + stream_); + sync_check_cuda_error(); + + TensorMap self_attention_input_tensors{ + {"input_query", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, + attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, + {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, + {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, + {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; + self_attention_input_tensors.insertIfValid( + "d_prefix_prompt_batch", + Tensor{MEMORY_GPU, + data_type, + {(size_t)local_batch_size}, + d_prefix_prompt_batch != nullptr ? d_prefix_prompt_batch + ite * local_batch_size : nullptr}); + self_attention_input_tensors.insertIfValid("d_prefix_prompt_lengths", + Tensor{MEMORY_GPU, + TYPE_INT32, + {(size_t)local_batch_size}, + d_prefix_prompt_lengths != nullptr ? 
+ d_prefix_prompt_lengths + ite * local_batch_size : + nullptr}); + + if (is_unpadded_mha) { + self_attention_input_tensors.insert("padding_offset", + Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset_}); + self_attention_input_tensors.insert( + "cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {size_t(local_batch_size + 1)}, cu_seqlens_}); + } + + size_t cache_offset = l - getFirstLayerParallelId(); + for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { + cache_offset *= *t; + }; + size_t ite_cache_offset = ite * local_batch_size; + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + ite_cache_offset *= *t; + } + cache_offset += ite_cache_offset; + + TensorMap self_attention_output_tensors{ + {"hidden_features", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, + {"value_cache", + Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + + self_attention_layer_->forward(&self_attention_output_tensors, + &self_attention_input_tensors, + &gpt_decoder_layer_weight->at(l)->self_attention_weights); + + if (is_final == false) { + if (use_gptj_residual_) { + invokeGeneralLayerNorm(decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + layernorm_eps_, + h_token_num, + hidden_units_, + (float*)nullptr, + 0, + stream_); + } + else { + invokeGeneralAddBiasResidualPreLayerNorm( + self_attn_output_, + decoder_normed_input_, + self_attn_output_, + layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + gpt_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + h_token_num, + hidden_units_, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + 0, + stream_); + } + + TensorMap ffn_input_tensors( + {{"ffn_input", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); + TensorMap ffn_output_tensors({{"ffn_output", + Tensor{MEMORY_GPU, + data_type, + {h_token_num, (size_t)hidden_units_}, + use_gptj_residual_ ? 
ffn_output_ : layer_output}}}); + ffn_layer_->forward( + &ffn_output_tensors, &ffn_input_tensors, &gpt_decoder_layer_weight->at(l)->ffn_weights); + + if (use_gptj_residual_) { + // Original workflow: + // layer_output = layer_input + reduceSum(ffn_output + self_attn_output + ffn_output_bias) + // Our workflow: + // layer_output = reduceSum(ffn_output + self_attn_output + ffn_output_bias + layer_input / + // TP_size) + // They are equivalent on math, but we can use same buffer for layer_input and layer_output + + invokeAddBiasAttentionFfnResidual(layer_output, + ffn_output_, + self_attn_output_, + layer_input, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + tensor_para_.world_size_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllReduceSum( + layer_output, layer_output, h_token_num * hidden_units_, tensor_para_, stream_); + } + } + else { + invokeAddBiasResidual(layer_output, + self_attn_output_, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + stream_); + } + + sync_check_cuda_error(); + + if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 + && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_ / tensor_para_.world_size_; + ftNcclSend(layer_output + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ + 1, + pipeline_para_, + stream_); + } + + if ((l == num_layer_ - 1) && is_unpadded_mha) { + invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, + decoder_layer_output_, + padding_offset_, + h_token_num, + head_num_ * size_per_head_, + stream_); + } + } + } + } + + // TODO(bhsueh) We could optimize this point by only computing the last token for the last layer + invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), + output_tensors->at("decoder_output").getPtr(), + input_tensors->at("input_lengths").getPtr(), + seq_len, + batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } +} + +template class GptNeoXContextDecoder; +template class GptNeoXContextDecoder; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h new file mode 100644 index 000000000..c81dcfe90 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include "src/fastertransformer/kernels/add_residual_kernels.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/layers/BaseLayer.h" +#include "src/fastertransformer/layers/FfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" +#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/utils/Tensor.h" +#include "src/fastertransformer/utils/allocator.h" +#include "src/fastertransformer/utils/cublasMMWrapper.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/nccl_utils.h" + +namespace fastertransformer { + +template +class GptNeoXContextDecoder: public BaseLayer { +private: + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t rotary_embedding_dim_; + bool neox_rotary_style_; + bool use_gptj_residual_; + float layernorm_eps_; + + // calculated data + size_t hidden_units_; + + NcclParam tensor_para_; + NcclParam pipeline_para_; + + std::shared_ptr custom_all_reduce_comm_; + int enable_custom_all_reduce_; + + AttentionType attention_type_; + + bool is_qk_buf_float_; + + BaseAttentionLayer* self_attention_layer_; + FfnLayer* ffn_layer_; + + void allocateBuffer() override; + void allocateBuffer(size_t batch_size, size_t seq_len); + void freeBuffer() override; + + bool isValidLayerParallelId(uint l); + bool isFirstLayerParallelId(uint l); + bool isLastLayerParallelId(uint l); + int getFirstLayerParallelId(); + + void initialize(); + +protected: + T* decoder_normed_input_ = nullptr; + T* self_attn_output_ = nullptr; + T* ffn_output_ = nullptr; + T* decoder_layer_output_ = nullptr; + size_t* h_pinned_token_num_ptr_ = nullptr; + int* padding_offset_ = nullptr; + int* cu_seqlens_ = nullptr; + +public: + GptNeoXContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type = AttentionType::FUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce_ = 0); + + GptNeoXContextDecoder(GptNeoXContextDecoder const& decoder); + + ~GptNeoXContextDecoder(); + + void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* decoder_layer_weights); + + void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* gpt_decoder_layer_weight); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoder.cc b/src/fastertransformer/models/llama/LLaMADecoder.cc new file mode 100644 index 000000000..7b73ba8ee --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMADecoder.cc @@ -0,0 +1,391 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/models/gptneox/GptNeoXDecoder.h" +#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.h" + +namespace fastertransformer { + +template +void GptNeoXDecoder::initialize() +{ + self_attention_layer_ = new TensorParallelDecoderSelfAttentionLayer(0, // max_batch_size + head_num_, + size_per_head_, + rotary_embedding_dim_, + neox_rotary_style_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + false, + 0, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size + 1, + head_num_, + size_per_head_, + 0, // expert_num + inter_size_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + false, + 0, + false, // use_gated_activation = false; + custom_all_reduce_comm_, + enable_custom_all_reduce_); +} + +template +void GptNeoXDecoder::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void GptNeoXDecoder::allocateBuffer(size_t batch_size) +{ + decoder_normed_input_ = reinterpret_cast( + allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * hidden_units_, false)); + self_attn_output_ = + reinterpret_cast(allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * hidden_units_, false)); + ffn_output_ = + reinterpret_cast(allocator_->reMalloc(ffn_output_, sizeof(T) * batch_size * hidden_units_, false)); + decoder_layer_output_ = reinterpret_cast( + allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * hidden_units_, false)); + is_allocate_buffer_ = true; +} + +template +void GptNeoXDecoder::freeBuffer() +{ + if (is_allocate_buffer_ == true) { + allocator_->free((void**)(&decoder_normed_input_)); + allocator_->free((void**)(&self_attn_output_)); + allocator_->free((void**)(&ffn_output_)); + allocator_->free((void**)(&decoder_layer_output_)); + is_allocate_buffer_ = false; + } +} + +template +bool GptNeoXDecoder::isValidLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) + && (l < local_num_layer * (pipeline_para_.rank_ + 1)); +} + +template +bool GptNeoXDecoder::isFirstLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); +} + +template +bool GptNeoXDecoder::isLastLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); +} + +template +int GptNeoXDecoder::getFirstLayerParallelId() +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return local_num_layer * pipeline_para_.rank_; +} + +template +GptNeoXDecoder::GptNeoXDecoder(size_t head_num, + size_t size_per_head, + 
size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + rotary_embedding_dim_(rotary_embedding_dim), + neox_rotary_style_(neox_rotary_style), + use_gptj_residual_(use_gptj_residual), + layernorm_eps_(layernorm_eps), + hidden_units_(head_num_ * size_per_head), + tensor_para_(tensor_para), + pipeline_para_(pipeline_para), + custom_all_reduce_comm_(custom_all_reduce_comm), + enable_custom_all_reduce_(enable_custom_all_reduce) +{ + initialize(); +} + +template +GptNeoXDecoder::GptNeoXDecoder(GptNeoXDecoder const& decoder): + BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), + head_num_(decoder.head_num_), + size_per_head_(decoder.size_per_head_), + inter_size_(decoder.inter_size_), + num_layer_(decoder.num_layer_), + rotary_embedding_dim_(decoder.rotary_embedding_dim_), + neox_rotary_style_(decoder.neox_rotary_style_), + use_gptj_residual_(decoder.use_gptj_residual_), + layernorm_eps_(decoder.layernorm_eps_), + hidden_units_(decoder.hidden_units_), + tensor_para_(decoder.tensor_para_), + pipeline_para_(decoder.pipeline_para_), + custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), + enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) +{ + initialize(); +} + +template +GptNeoXDecoder::~GptNeoXDecoder() +{ + delete self_attention_layer_; + delete ffn_layer_; + freeBuffer(); +} + +template +void GptNeoXDecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + FT_CHECK(false); +} + +template +void GptNeoXDecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + // input tensors: + // decoder_input [local_batch_size, hidden_dimension], + // finished [local_batch_size], + // sequence_lengths [local_batch_size] + // total_padding_tokens [local_batch_size], + // max_input_length [1] on cpu + // d_prefix_prompt_lengths [local_batch_size], on GPU + // max_prefix_prompt_length [1] on cpu + // step [1] on cpu + // ite [1] on cpu + // cache_indirection [local_batch_size / beam_width, beam_width, memory_len] + // Here, local_batch_size contains the beam_width, so local_batch_size / beam_width + // is real local_batch_size. 
+ // masked_tokens[local_batch_size, memory_len] + + // output tensors: + // decoder_output [local_batch_size, hidden_dimension], + // key_cache [num_layer, batch_size, head_num, size_per_head // x, memory_len, x] + // value_cache [num_layer, batch_size, head_num, memory_len, size_per_head] + + FT_CHECK(input_tensors->size() == 11); + FT_CHECK(output_tensors->size() == 3); + + const DataType data_type = getTensorType(); + const size_t local_batch_size = input_tensors->at("decoder_input").shape[0]; + allocateBuffer(local_batch_size); + const int ite = input_tensors->at("ite").getVal(); + + T* decoder_input = input_tensors->at("decoder_input").getPtr(); + T* decoder_output = output_tensors->at("decoder_output").getPtr(); + + Tensor& k_cache = output_tensors->at("key_cache"); + Tensor& v_cache = output_tensors->at("value_cache"); + std::vector self_k_cache_size; + self_k_cache_size.push_back(local_batch_size); + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + self_k_cache_size.push_back(*t); + } + std::vector self_v_cache_size; + self_v_cache_size.push_back(local_batch_size); + for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { + self_v_cache_size.push_back(*t); + } + + for (uint l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l) == false) { + continue; + } + T* layer_input = (l == 0) ? decoder_input : decoder_layer_output_; + T* layer_output = (l == num_layer_ - 1) ? decoder_output : decoder_layer_output_; + + if (isFirstLayerParallelId(l) == true && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { + int data_size = local_batch_size * hidden_units_ / tensor_para_.world_size_; + // ftNcclRecv(layer_input, local_batch_size * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, + // stream_); + + ftNcclRecv(layer_input + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ - 1, + pipeline_para_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); + } + } + + invokeGeneralLayerNorm(decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + local_batch_size, + hidden_units_, + (float*)nullptr, + 0, + stream_); + sync_check_cuda_error(); + + TensorMap self_attention_input_tensors(*input_tensors); + self_attention_input_tensors.insert( + "input_query", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}); + + size_t cache_offset = l - getFirstLayerParallelId(); + for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { + cache_offset *= *t; + }; + size_t ite_cache_offset = ite * local_batch_size; + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + ite_cache_offset *= *t; + } + cache_offset += ite_cache_offset; + + TensorMap self_attention_output_tensors{ + {"hidden_features", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, self_attn_output_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + + self_attention_layer_->forward(&self_attention_output_tensors, + &self_attention_input_tensors, + &gpt_decoder_layer_weight->at(l)->self_attention_weights); + if (use_gptj_residual_) { + invokeGeneralLayerNorm(decoder_normed_input_, + 
layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + layernorm_eps_, + local_batch_size, + hidden_units_, + (float*)nullptr, + 0, + stream_); + } + else { + invokeGeneralAddBiasResidualPreLayerNorm( + self_attn_output_, + decoder_normed_input_, + self_attn_output_, + layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + gpt_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + local_batch_size, + hidden_units_, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + 0, + stream_); + } + + TensorMap ffn_input_tensors( + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}}}); + TensorMap ffn_output_tensors({{"ffn_output", + Tensor{MEMORY_GPU, + data_type, + {local_batch_size, hidden_units_}, + use_gptj_residual_ ? ffn_output_ : layer_output}}}); + ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &gpt_decoder_layer_weight->at(l)->ffn_weights); + + if (use_gptj_residual_) { + // Original workflow: + // layer_output = layer_input + reduceSum(ffn_output + self_attn_output + ffn_output_bias) + // Our workflow: + // layer_output = reduceSum(ffn_output + self_attn_output + ffn_output_bias + layer_input / TP_size) + // They are equivalent on math, but we can use same buffer for layer_input and layer_output + invokeAddBiasAttentionFfnResidual(layer_output, + ffn_output_, + self_attn_output_, + layer_input, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + local_batch_size, + hidden_units_, + tensor_para_.world_size_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllReduceSum(layer_output, layer_output, local_batch_size * hidden_units_, tensor_para_, stream_); + } + } + else { + invokeAddBiasResidual(layer_output, + self_attn_output_, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + local_batch_size, + hidden_units_, + stream_); + } + + sync_check_cuda_error(); + + if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 + && pipeline_para_.world_size_ > 1) { + int data_size = local_batch_size * hidden_units_ / tensor_para_.world_size_; + // ftNcclSend(layer_output, local_batch_size * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, + // stream_); + + ftNcclSend(layer_output + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ + 1, + pipeline_para_, + stream_); + } + } + + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } +} + +template class GptNeoXDecoder; +template class GptNeoXDecoder; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoder.h b/src/fastertransformer/models/llama/LLaMADecoder.h new file mode 100644 index 000000000..add736adc --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMADecoder.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "src/fastertransformer/kernels/add_residual_kernels.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/layers/BaseLayer.h" +#include "src/fastertransformer/layers/FfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" +#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/utils/Tensor.h" +#include "src/fastertransformer/utils/allocator.h" +#include "src/fastertransformer/utils/cublasMMWrapper.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/nccl_utils.h" + +namespace fastertransformer { + +template +class GptNeoXDecoder: public BaseLayer { +private: +protected: + void allocateBuffer() override; + void allocateBuffer(size_t batch_size); + void freeBuffer() override; + bool isValidLayerParallelId(uint l); + bool isFirstLayerParallelId(uint l); + bool isLastLayerParallelId(uint l); + int getFirstLayerParallelId(); + virtual void initialize(); + + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t rotary_embedding_dim_; + bool neox_rotary_style_; + bool use_gptj_residual_; + size_t hidden_units_; + float layernorm_eps_; + + NcclParam tensor_para_; + NcclParam pipeline_para_; + + std::shared_ptr custom_all_reduce_comm_; + int enable_custom_all_reduce_; + + T* decoder_normed_input_ = nullptr; + T* self_attn_output_ = nullptr; + T* ffn_output_ = nullptr; + T* decoder_layer_output_ = nullptr; + + BaseAttentionLayer* self_attention_layer_; + FfnLayer* ffn_layer_; + +public: + GptNeoXDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce_ = 0); + + GptNeoXDecoder(GptNeoXDecoder const& decoder); + + virtual ~GptNeoXDecoder(); + + virtual void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* decoder_layer_weights); + + virtual void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* decoder_layer_weights); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc new file mode 100644 index 000000000..3d62df83d --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/utils/memory_utils.h" + +namespace fastertransformer { + +template +GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const int hidden_units, + const int inter_size, + const int tensor_para_size, + const int tensor_para_rank, + const bool use_gptj_residual): + hidden_units_(hidden_units), + inter_size_(inter_size), + tensor_para_size_(tensor_para_size), + tensor_para_rank_(tensor_para_rank), + use_gptj_residual_(use_gptj_residual) +{ + mallocWeights(); + setWeightPtr(); +} + +template +GptNeoXDecoderLayerWeight::~GptNeoXDecoderLayerWeight() +{ + if (is_maintain_buffer == true) { + for (int i = 0; i < 12; i++) { + if (!use_gptj_residual_ && i != attention_dense_bias_weight_id) { + cudaFree(weights_ptr[i]); + } + } + + pre_layernorm_weights.beta = nullptr; + pre_layernorm_weights.gamma = nullptr; + self_attention_weights.query_weight.kernel = nullptr; + self_attention_weights.query_weight.bias = nullptr; + self_attention_weights.attention_output_weight.kernel = nullptr; + self_attention_weights.attention_output_weight.bias = nullptr; + post_attention_layernorm_weights.beta = nullptr; + post_attention_layernorm_weights.gamma = nullptr; + + ffn_weights.intermediate_weight.kernel = nullptr; + ffn_weights.intermediate_weight.bias = nullptr; + ffn_weights.output_weight.kernel = nullptr; + ffn_weights.output_weight.bias = nullptr; + is_maintain_buffer = false; + } +} + +template +GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + use_gptj_residual_(other.use_gptj_residual_) +{ + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + if (!use_gptj_residual_) { + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + } + + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + setWeightPtr(); +} + +template +GptNeoXDecoderLayerWeight& GptNeoXDecoderLayerWeight::operator=(const GptNeoXDecoderLayerWeight& other) +{ + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + tensor_para_size_ = 
other.tensor_para_size_; + tensor_para_rank_ = other.tensor_para_rank_; + use_gptj_residual_ = other.use_gptj_residual_; + + mallocWeights(); + + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + if (!use_gptj_residual_) { + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + } + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + setWeightPtr(); + return *this; +} + +template +void GptNeoXDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) +{ + FT_CHECK(is_maintain_buffer == true); + const std::string rank_spec = std::to_string(tensor_para_rank_); + + loadWeightFromBin( + weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".input_layernorm.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".input_layernorm.weight.bin", model_file_type); + loadWeightFromBin(weights_ptr[2], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBin(weights_ptr[3], + {(size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.bias." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBin(weights_ptr[4], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." + rank_spec + ".bin", + model_file_type); + + if (!use_gptj_residual_) { + loadWeightFromBin( + weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.dense.bias.bin", model_file_type); + } + + loadWeightFromBin(weights_ptr[6], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", + model_file_type); + loadWeightFromBin(weights_ptr[7], + {(size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.dense_h_to_4h.bias." + rank_spec + ".bin", + model_file_type); + loadWeightFromBin(weights_ptr[8], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.dense_4h_to_h.weight." 
+ rank_spec + ".bin", + model_file_type); + if (use_gptj_residual_) { + loadWeightFromBin( + weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".mlp.attention.bias.sum.bin", model_file_type); + } + else { + loadWeightFromBin( + weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.bias.bin", model_file_type); + } + loadWeightFromBin( + weights_ptr[10], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[11], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); +} + +template +void GptNeoXDecoderLayerWeight::setWeightPtr() +{ + pre_layernorm_weights.beta = weights_ptr[0]; + pre_layernorm_weights.gamma = weights_ptr[1]; + self_attention_weights.query_weight.kernel = weights_ptr[2]; + self_attention_weights.query_weight.bias = weights_ptr[3]; + self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; + self_attention_weights.attention_output_weight.bias = use_gptj_residual_ ? nullptr : weights_ptr[5]; + + ffn_weights.intermediate_weight.kernel = weights_ptr[6]; + ffn_weights.intermediate_weight.bias = weights_ptr[7]; + ffn_weights.output_weight.kernel = weights_ptr[8]; + ffn_weights.output_weight.bias = weights_ptr[9]; + + post_attention_layernorm_weights.beta = weights_ptr[10]; + post_attention_layernorm_weights.gamma = weights_ptr[11]; + is_maintain_buffer = true; +} + +template +void GptNeoXDecoderLayerWeight::mallocWeights() +{ + deviceMalloc(&weights_ptr[0], hidden_units_); + deviceMalloc(&weights_ptr[1], hidden_units_); + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + if (!use_gptj_residual_) { + deviceMalloc(&weights_ptr[5], hidden_units_); + } + + deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); + deviceMalloc(&weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); + deviceMalloc(&weights_ptr[9], hidden_units_); + deviceMalloc(&weights_ptr[10], hidden_units_); + deviceMalloc(&weights_ptr[11], hidden_units_); +} + +template struct GptNeoXDecoderLayerWeight; +template struct GptNeoXDecoderLayerWeight; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h new file mode 100644 index 000000000..2850da466 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/layers/FfnWeight.h" +#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" +#include "src/fastertransformer/utils/cuda_utils.h" + +namespace fastertransformer { + +template +struct GptNeoXDecoderLayerWeight { +public: + GptNeoXDecoderLayerWeight() = default; + GptNeoXDecoderLayerWeight(const int hidden_units, + const int inter_size, + const int tensor_para_size = 1, + const int tensor_para_rank = 0, + const bool use_gptj_residual = true); + ~GptNeoXDecoderLayerWeight(); + GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other); + GptNeoXDecoderLayerWeight& operator=(const GptNeoXDecoderLayerWeight& other); + + void loadModel(std::string dir_path, FtCudaDataType model_file_type); + + LayerNormWeight pre_layernorm_weights; + AttentionWeight self_attention_weights; + LayerNormWeight post_attention_layernorm_weights; + FfnWeight ffn_weights; + +private: + int hidden_units_; + int inter_size_; + int tensor_para_size_; + int tensor_para_rank_; + bool use_gptj_residual_; + const int attention_dense_bias_weight_id = 5; + bool is_maintain_buffer = false; + T* weights_ptr[12]; + + void setWeightPtr(); + void mallocWeights(); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc new file mode 100644 index 000000000..26995f255 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/gptneox/GptNeoXWeight.h" + +namespace fastertransformer { + +template +GptNeoXWeight::GptNeoXWeight(const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int max_seq_len, + const int tensor_para_size, + const int tensor_para_rank, + const int layer_para_size, + const int layer_para_rank, + const bool use_gptj_residual, + PromptLearningType prompt_learning_type, + std::map> prompt_learning_pair): + hidden_units_(hidden_units), + inter_size_(inter_size), + vocab_size_(vocab_size), + num_layer_(num_layer), + max_seq_len_(max_seq_len), + tensor_para_size_(tensor_para_size), + tensor_para_rank_(tensor_para_rank), + layer_para_size_(layer_para_size), + layer_para_rank_(layer_para_rank), + use_gptj_residual_(use_gptj_residual), + prompt_learning_type_(prompt_learning_type), + prompt_learning_pair_(prompt_learning_pair) +{ + FT_CHECK(num_layer_ % layer_para_size_ == 0); + // set prompt weight size + if (prompt_learning_type_ == PromptLearningType::prefix_prompt) { + prompt_token_weight_size_ = 2 * num_layer_ * hidden_units_ / tensor_para_size_; + } + else if (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) { + prompt_token_weight_size_ = hidden_units_; + } + + // set if load and malloc prompt weights + malloc_load_prompt_weights_ = !prompt_learning_pair_.empty() + && (prompt_learning_type_ == PromptLearningType::p_prompt_tuning + || prompt_learning_type_ == PromptLearningType::prefix_prompt); + + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l)) { + decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight( + hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); + } + else { + // Layer-parallelism: allocate empty layer because + // this rank does not compute it: + decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight(0, 0)); + } + } + + mallocWeights(); + setWeightPtr(); +} + +template +GptNeoXWeight::~GptNeoXWeight() +{ + if (is_maintain_buffer == true) { + for (int i = 0; i < weights_ptr.size(); i++) { + deviceFree(weights_ptr[i]); + } + + pre_decoder_embedding_table = nullptr; + post_decoder_layernorm.beta = nullptr; + post_decoder_layernorm.gamma = nullptr; + post_decoder_embedding.kernel = nullptr; + is_maintain_buffer = false; + } +} + +template +GptNeoXWeight::GptNeoXWeight(const GptNeoXWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + vocab_size_(other.vocab_size_), + num_layer_(other.num_layer_), + max_seq_len_(other.max_seq_len_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + layer_para_size_(other.layer_para_size_), + layer_para_rank_(other.layer_para_rank_), + use_gptj_residual_(other.use_gptj_residual_), + prompt_token_weight_size_(other.prompt_token_weight_size_), + malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), + prompt_learning_type_(other.prompt_learning_type_), + prompt_learning_pair_(other.prompt_learning_pair_) +{ + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning table: malloc weights and set weight ptr + if (malloc_load_prompt_weights_) { + for (auto const& prompt : 
prompt_learning_pair_) { + std::string task_name = prompt.first; + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t prompt_id = num_base_weights + (size_t)task_name_id; + + // cuda device to device memcpy prompt table weights buffer memory + cudaD2Dcpy(weights_ptr[prompt_id], other.weights_ptr[prompt_id], prompt_length * prompt_token_weight_size_); + } + } + + setWeightPtr(); + + decoder_layer_weights.clear(); + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(other.decoder_layer_weights[l]); + } +} + +template +GptNeoXWeight& GptNeoXWeight::operator=(const GptNeoXWeight& other) +{ + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + vocab_size_ = other.vocab_size_; + num_layer_ = other.num_layer_; + max_seq_len_ = other.max_seq_len_; + tensor_para_size_ = other.tensor_para_size_; + tensor_para_rank_ = other.tensor_para_rank_; + layer_para_size_ = other.layer_para_size_; + layer_para_rank_ = other.layer_para_rank_; + use_gptj_residual_ = other.use_gptj_residual_; + prompt_token_weight_size_ = other.prompt_token_weight_size_; + malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; + prompt_learning_type_ = other.prompt_learning_type_; + prompt_learning_pair_ = other.prompt_learning_pair_; + + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning table: malloc weights and set weight ptr + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + std::string task_name = prompt.first; + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t prompt_id = num_base_weights + (size_t)task_name_id; + + // cuda device to device memcpy prompt table weights buffer memory + cudaD2Dcpy(weights_ptr[prompt_id], other.weights_ptr[prompt_id], prompt_length * prompt_token_weight_size_); + } + } + + setWeightPtr(); + + decoder_layer_weights.clear(); + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(other.decoder_layer_weights[l]); + } + return *this; +} + +template +void GptNeoXWeight::setWeightPtr() +{ + prompt_learning_table.resize(prompt_learning_pair_.size()); + + pre_decoder_embedding_table = weights_ptr[0]; + post_decoder_layernorm.beta = weights_ptr[1]; + post_decoder_layernorm.gamma = weights_ptr[2]; + post_decoder_embedding.kernel = weights_ptr[3]; + + // prompt learning tables: set weight ptr + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t task_weight_id = num_base_weights + (size_t)task_name_id; + + // set weight ptr + prompt_learning_table[task_name_id] = {weights_ptr[task_weight_id], prompt_length}; + } + } +} + +template +void GptNeoXWeight::mallocWeights() +{ + weights_ptr.resize(num_base_weights + prompt_learning_pair_.size()); + + deviceMalloc(&weights_ptr[0], vocab_size_ * hidden_units_); + deviceMalloc(&weights_ptr[1], hidden_units_); + deviceMalloc(&weights_ptr[2], hidden_units_); + deviceMalloc(&weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning tables: malloc weights + if 
(malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t task_weight_id = num_base_weights + (size_t)task_name_id; + + // malloc weights + T* prompt_weights_ptr = nullptr; + deviceMalloc(&prompt_weights_ptr, prompt_length * prompt_token_weight_size_); + weights_ptr[task_weight_id] = prompt_weights_ptr; + } + } + is_maintain_buffer = true; +} + +template +void GptNeoXWeight::loadModel(std::string dir_path) +{ + FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "gptneox"); + FT_CHECK(is_maintain_buffer == true); + + loadWeightFromBin( + weights_ptr[0], {(size_t)(vocab_size_ * hidden_units_)}, dir_path + "/model.wte.bin", model_file_type); + loadWeightFromBin( + weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.weight.bin", model_file_type); + loadWeightFromBin(weights_ptr[3], + {(size_t)(vocab_size_ * hidden_units_)}, + dir_path + "/model.lm_head.weight.bin", + model_file_type); + + // prompt table: load weights from bin + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + std::string task_name = prompt.first; + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t task_weight_id = num_base_weights + (size_t)task_name_id; + + std::string prompt_weight_path_name = (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) ? + (dir_path + "/model.prompt_table." + task_name + ".weight.bin") : + (dir_path + "/model.prefix_prompt." + task_name + ".weight." + + std::to_string(tensor_para_rank_) + ".bin"); + + if (prompt_length > 0) { + loadWeightFromBin(weights_ptr[task_weight_id], + {(size_t)(prompt_length * (int)prompt_token_weight_size_)}, + prompt_weight_path_name, + model_file_type); + } + } + } + + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l)) { + decoder_layer_weights[l]->loadModel(dir_path + "/model.layers." + std::to_string(l), model_file_type); + } + } +} + +template +void GptNeoXWeight::resizeLayer(const int num_layer) +{ + num_layer_ = num_layer; + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight()); + } +} + +template +bool GptNeoXWeight::isValidLayerParallelId(int l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / layer_para_size_)); + return l < num_layer_ && (l >= local_num_layer * layer_para_rank_) + && (l < local_num_layer * (layer_para_rank_ + 1)); +} + +template struct GptNeoXWeight; +template struct GptNeoXWeight; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAWeight.h b/src/fastertransformer/models/llama/LLaMAWeight.h new file mode 100644 index 000000000..3e868854e --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMAWeight.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/utils/memory_utils.h" +#include "src/fastertransformer/utils/prompt_learning.h" + +namespace fastertransformer { + +template +struct GptNeoXWeight { + + GptNeoXWeight() = default; + GptNeoXWeight( + const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int max_seq_len, + const int tensor_para_size = 1, + const int tensor_para_rank = 0, + const int layer_para_size = 1, + const int layer_para_rank = 0, + const bool use_gptj_residual_ = true, + PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, + std::map> prompt_learning_pair = std::map>{}); + + ~GptNeoXWeight(); + GptNeoXWeight(const GptNeoXWeight& other); + GptNeoXWeight& operator=(const GptNeoXWeight& other); + + void loadModel(std::string dir_path); + + void resizeLayer(const int num_layer); + + std::vector*> decoder_layer_weights; + const T* pre_decoder_embedding_table = nullptr; + // GPT-J does not use embedding table, but we leave the ptr such that + // GptNeoX::forward and Gpt::forward become identical + const T* position_encoding_table = nullptr; + + /* + prompt_learning_pair = vectors of [weight ptr, prompt length] pair + prompt_length is stored here for compatible prompt learning table + prefix_prompt weights store as shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] + p/prompt tuning weights store as shape [prompt_len, hidden_units] + idx is the task_name_id of the prompt tables + */ + std::vector> prompt_learning_table = {}; + + LayerNormWeight post_decoder_layernorm; + DenseWeight post_decoder_embedding; + + inline void setMaxSeqLen(size_t max_seq_len) + { + max_seq_len_ = max_seq_len; + } + +private: + void setWeightPtr(); + void mallocWeights(); + bool isValidLayerParallelId(int l); + + int hidden_units_; + int inter_size_; + int vocab_size_; + int num_layer_; + int max_seq_len_; + + int tensor_para_size_; + int tensor_para_rank_; + int layer_para_size_; + int layer_para_rank_; + + // residual type + bool use_gptj_residual_; + + // prompt learning pair (task_name, (task_name_id, prompt_len)) + PromptLearningType prompt_learning_type_; + std::map> prompt_learning_pair_; + bool malloc_load_prompt_weights_ = false; + // each prompt token's weight size + size_t prompt_token_weight_size_ = 0; + + bool is_maintain_buffer = false; + const size_t num_base_weights = 4; + std::vector weights_ptr = std::vector(num_base_weights); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/th_op/llama/CMakeLists.txt b/src/fastertransformer/th_op/llama/CMakeLists.txt new file mode 100755 index 000000000..75d13790e --- /dev/null +++ b/src/fastertransformer/th_op/llama/CMakeLists.txt @@ -0,0 +1,17 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_library(th_llama STATIC LLaMA.cc) +set_property(TARGET th_llama PROPERTY POSITION_INDEPENDENT_CODE ON) +target_link_libraries(th_llama PRIVATE "${TORCH_LIBRARIES}" LLaMA th_utils nccl_utils) diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc new file mode 100755 index 000000000..e913570cd --- /dev/null +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/th_op/llama/LLaMA.h" + +namespace th = torch; +namespace ft = fastertransformer; +namespace torch_ext { + +LLaMA::LLaMA(const int64_t head_num, + const int64_t size_per_head, + const int64_t inter_size, + const int64_t layer_num, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t start_id, + const int64_t end_id, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const int64_t max_seq_len, + const bool use_gptj_residual, + const vector weights): + st_(weights[0].scalar_type()) +{ + for (auto t : weights) { + CHECK_INPUT(t, st_); + } + + switch (st_) { + case at::ScalarType::Float: + ftgpt = new FTGptNeoX((size_t)head_num, + (size_t)size_per_head, + (size_t)inter_size, + (size_t)layer_num, + (size_t)vocab_size, + (size_t)rotary_embedding_dim, + start_id, + end_id, + tensor_para_size, + pipeline_para_size, + (size_t)max_seq_len, + use_gptj_residual, + weights); + break; + case at::ScalarType::Half: + ftgpt = new FTGptNeoX((size_t)head_num, + (size_t)size_per_head, + (size_t)inter_size, + (size_t)layer_num, + (size_t)vocab_size, + (size_t)rotary_embedding_dim, + start_id, + end_id, + tensor_para_size, + pipeline_para_size, + (size_t)max_seq_len, + use_gptj_residual, + weights); + break; + default: + throw std::runtime_error("Wrong Tensor type."); + } +} + +LLaMA::~LLaMA() +{ + delete ftgpt; +} + +std::vector LLaMA::forward(th::Tensor input_ids, + th::Tensor input_lengths, + const int64_t output_len, + th::optional beam_width_opt, + th::optional top_k_opt, + th::optional top_p_opt, + th::optional beam_search_diversity_rate_opt, + th::optional temperature_opt, + th::optional len_penalty_opt, + th::optional repetition_penalty_opt, + th::optional random_seed_opt, + th::optional return_cum_log_probs_opt) +{ + CHECK_TH_CUDA(input_ids); + CHECK_CONTIGUOUS(input_ids); + TORCH_CHECK(input_ids.dtype() == torch::kInt32, "input_ids dtype should be int32"); + CHECK_TH_CUDA(input_lengths); + CHECK_CONTIGUOUS(input_lengths); + TORCH_CHECK(input_lengths.dtype() 
== torch::kInt32, "input_lengths dtype should be int32"); + int64_t return_cum_log_probs = return_cum_log_probs_opt.has_value() ? (int64_t)return_cum_log_probs_opt.value() : 0; + if (return_cum_log_probs_opt.has_value()) { + TORCH_CHECK(return_cum_log_probs == 0 || return_cum_log_probs == 1, + "return_cum_log_probs should be" + " 0 (no return cum_log_probs), " + " 1 (the cumulative log probs of generated sequences)") + } + + const int beam_width = beam_width_opt.has_value() ? (int)beam_width_opt.value() : 1; + + const int batch_size = input_ids.size(0); + const int max_input_length = input_ids.size(1); + const int total_request_output_len = max_input_length + output_len; + th::Tensor output_ids = torch::empty({batch_size, beam_width, total_request_output_len}, + torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); + th::Tensor sequence_lengths = + torch::empty({batch_size, beam_width}, torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); + th::Tensor cum_log_probs = + torch::empty({batch_size, beam_width}, torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false)); + + ftgpt->forward(input_ids, + input_lengths, + output_ids, + sequence_lengths, + cum_log_probs, + (const size_t)output_len, + (const size_t)beam_width, + top_k_opt, + top_p_opt, + beam_search_diversity_rate_opt, + temperature_opt, + len_penalty_opt, + repetition_penalty_opt, + random_seed_opt, + return_cum_log_probs_opt); + if (return_cum_log_probs > 0) { + return std::vector{output_ids, sequence_lengths, cum_log_probs}; + } + return std::vector{output_ids, sequence_lengths}; +} + +} // namespace torch_ext + +static auto fasterTransformerGptTHS = +#ifdef LEGACY_THS + torch::jit::class_("FasterTransformerLLaMA") +#else + torch::jit::class_("FasterTransformer", "LLaMA") +#endif + .def(torch::jit::init>()) + .def("forward", &torch_ext::LLaMA::forward); diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h new file mode 100755 index 000000000..3cca0bb19 --- /dev/null +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/LLaMA.h" +#include "src/fastertransformer/th_op/th_utils.h" +#include "src/fastertransformer/utils/cuda_bf16_wrapper.h" +#include "src/fastertransformer/utils/nccl_utils.h" + +namespace ft = fastertransformer; +namespace th = torch; +namespace torch_ext { + +using std::vector; + +class IFLLaMA { +public: + virtual ~IFLLaMA() {} + virtual void forward(th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& output_ids, + th::Tensor& sequence_lengths, + th::Tensor& cum_log_probs, + const size_t request_output_len, + const size_t beam_width, + th::optional top_k_opt, + th::optional top_p_opt, + th::optional beam_search_diversity_rate_opt, + th::optional temperature_opt, + th::optional len_penalty_opt, + th::optional repetition_penalty_opt, + th::optional random_seed_opt, + th::optional return_cum_log_probs_opt) = 0; +}; + +template +class FTLLaMA: public IFLLaMA { +public: + FTLLaMA(const size_t head_num, + const size_t size_per_head, + const size_t inter_size, + const size_t layer_num, + const size_t vocab_size, + const size_t rotary_embedding_dim, + const int start_id, + const int end_id, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const size_t max_seq_len, + const bool use_gptj_residual, + const vector weights): + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + layer_num_(layer_num), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + start_id_(start_id), + end_id_(end_id), + use_gptj_residual_(use_gptj_residual), + weights_(weights), + tensor_para_size_(tensor_para_size), + pipeline_para_size_(pipeline_para_size) + { + ft::check_cuda_error(cublasLtCreate(&cublasltHandle_)); + cublas_algo_map_ = new ft::cublasAlgoMap(GEMM_CONFIG, ""); + cublas_wrapper_mutex_ = new std::mutex(); + + ftNcclInitialize(tensor_para_, pipeline_para_, tensor_para_size, pipeline_para_size); + + gpt_weights_.resizeLayer(layer_num_); + for (int i = 0; i < (int)layer_num_; i++) { + gpt_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta = + get_ptr(weights_[i + 0 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->pre_layernorm_weights.gamma = + get_ptr(weights_[i + 1 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.kernel = + get_ptr(weights_[i + 2 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.bias = + get_ptr(weights_[i + 3 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.kernel = + get_ptr(weights_[i + 4 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.bias = + get_ptr(weights_[i + 5 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.kernel = + get_ptr(weights_[i + 6 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.bias = + get_ptr(weights_[i + 7 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.kernel = + get_ptr(weights_[i + 8 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.bias = + get_ptr(weights_[i + 9 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.beta = + get_ptr(weights_[i + 10 * layer_num_]); + gpt_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.gamma = + get_ptr(weights_[i + 11 * layer_num_]); + } + + gpt_weights_.pre_decoder_embedding_table 
= get_ptr(weights_[12 * layer_num_ + 0]); + gpt_weights_.post_decoder_layernorm.gamma = get_ptr(weights_[12 * layer_num_ + 1]); + gpt_weights_.post_decoder_layernorm.beta = get_ptr(weights_[12 * layer_num_ + 2]); + gpt_weights_.post_decoder_embedding.kernel = get_ptr(weights_[12 * layer_num_ + 3]); + + gpt_weights_.setMaxSeqLen(max_seq_len); + + ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); + } + + ~FTLLaMA() override + { + ft::ftNcclParamDestroy(tensor_para_); + ft::ftNcclParamDestroy(pipeline_para_); + cublasLtDestroy(cublasltHandle_); + delete cublas_algo_map_; + delete cublas_wrapper_mutex_; + } + + void forward(th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& output_ids, + th::Tensor& sequence_lengths, + th::Tensor& cum_log_probs, + const size_t request_output_len, + const size_t beam_width, + th::optional top_k_opt, + th::optional top_p_opt, + th::optional beam_search_diversity_rate_opt, + th::optional temperature_opt, + th::optional len_penalty_opt, + th::optional repetition_penalty_opt, + th::optional random_seed_opt, + th::optional return_cum_log_probs_opt) override + { + int return_cum_log_probs = return_cum_log_probs_opt.has_value() ? (int)return_cum_log_probs_opt.value() : 0; + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); + cublasSetStream(cublasHandle, stream); + ft::Allocator allocator = ft::Allocator(); + ft::cublasMMWrapper cublas_wrapper = ft::cublasMMWrapper( + cublasHandle, cublasltHandle_, stream, cublas_algo_map_, cublas_wrapper_mutex_, &allocator); + + if (std::is_same::value) { + cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } + else if (std::is_same::value) { + cublas_wrapper.setFP32GemmConfig(); + } + + const size_t request_batch_size = (size_t)input_ids.size(0); + const size_t max_input_length = (size_t)input_ids.size(1); + const int total_output_len = (int)(max_input_length + request_output_len); + + ft::AttentionType attention_type = ft::getAttentionType(size_per_head_, + ft::getSMVersion(), + true, // remove_padding + 0, // gpt supports any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + + ft::LLaMA gpt = ft::LLaMA(head_num_, + size_per_head_, + inter_size_, + layer_num_, + vocab_size_, + rotary_embedding_dim_, + start_id_, + end_id_, + end_id_ + 1, // p/prompt tuning virtual token start id + ft::PromptLearningType::no_prompt, + use_gptj_residual_, + 0.0f, // beam_search_diversity_rate, + 1, // top_k, + 0.0, // top_p, + 0, // random_seed, + 1.0f, // temperature, + 1.0f, // len_penalty, + 1.0f, // repetition_penalty, + tensor_para_, + pipeline_para_, + stream, + &cublas_wrapper, + &allocator, + false, // is_free_buffer_after_forward + &prop_, // cuda_device_prop + attention_type, // attention_type + nullptr, // custom_all_reduce_comm + 0); // enable_custom_all_reduce + + std::vector output_seq_len(request_batch_size, total_output_len); + + std::unordered_map input_tensors = std::unordered_map{ + {"input_ids", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{request_batch_size, max_input_length}, + get_ptr(input_ids)}}, + {"input_lengths", + ft::Tensor{ + ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{request_batch_size}, get_ptr(input_lengths)}}, + {"output_seq_len", + ft::Tensor{ + ft::MEMORY_CPU, ft::TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}}; + if (beam_width > 1 && beam_search_diversity_rate_opt.has_value()) { + 
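+            // Note (editorial comment, not in the original patch): this and the optional sampling parameters
+            // below are only inserted into input_tensors when the caller supplied a value;
+            // beam_search_diversity_rate additionally requires beam search to be active (beam_width > 1).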
input_tensors.insert( + {"beam_search_diversity_rate", + convert_tensor(beam_search_diversity_rate_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (top_p_opt.has_value()) { + input_tensors.insert( + {"runtime_top_p", convert_tensor(top_p_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (top_k_opt.has_value()) { + input_tensors.insert( + {"runtime_top_k", convert_tensor(top_k_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (temperature_opt.has_value()) { + input_tensors.insert( + {"temperature", convert_tensor(temperature_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (len_penalty_opt.has_value()) { + input_tensors.insert( + {"len_penalty", convert_tensor(len_penalty_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (repetition_penalty_opt.has_value()) { + input_tensors.insert({"repetition_penalty", + convert_tensor(repetition_penalty_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + if (random_seed_opt.has_value()) { + input_tensors.insert( + {"random_seed", + convert_tensor(random_seed_opt.value(), ft::MemoryType::MEMORY_CPU)}); + } + + std::unordered_map output_tensors = std::unordered_map{ + {"output_ids", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{request_batch_size, beam_width, (size_t)total_output_len}, + get_ptr(output_ids)}}, + {"sequence_length", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{request_batch_size, beam_width}, + get_ptr(sequence_lengths)}}}; + + if (return_cum_log_probs > 0) { + output_tensors.insert({"cum_log_probs", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_FP32, + std::vector{request_batch_size, beam_width}, + get_ptr(cum_log_probs)}}); + } + + try { + gpt.forward(&output_tensors, &input_tensors, &gpt_weights_); + } + catch (std::runtime_error& error) { + std::cout << error.what(); + exit(-1); + } + catch (...) 
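+        // Note (editorial comment, not in the original patch): catch-all for any non-std exception
+        // escaping the forward pass; it is reported and treated as fatal.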
{ + std::cout << "Runtime error"; + exit(-1); + } + } + +private: + const size_t head_num_; + const size_t size_per_head_; + const size_t inter_size_; + const size_t layer_num_; + const size_t vocab_size_; + const size_t rotary_embedding_dim_; + const int start_id_; + const int end_id_; + const bool use_gptj_residual_; + + // const ft::gptVariantParams gpt_variant_params_; + + std::vector weights_; + cublasLtHandle_t cublasltHandle_; + std::mutex* cublas_wrapper_mutex_; + ft::cublasAlgoMap* cublas_algo_map_; + struct cudaDeviceProp prop_; + ft::LLaMAWeight gpt_weights_; + + ft::NcclParam tensor_para_; + ft::NcclParam pipeline_para_; + + int64_t tensor_para_size_; + int64_t pipeline_para_size_; +}; + +class LLaMA: public th::jit::CustomClassHolder { +public: + LLaMA(const int64_t head_num, + const int64_t size_per_head, + const int64_t inter_size, + const int64_t layer_num, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t start_id, + const int64_t end_id, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const int64_t max_seq_len, + const bool use_gptj_residual, + const vector weights); + + ~LLaMA(); + + vector forward(th::Tensor input_ids, + th::Tensor input_lengths, + const int64_t output_len, + th::optional beam_width_opt, + th::optional top_k_opt, + th::optional top_p_opt, + th::optional beam_search_diversity_rate_opt, + th::optional temperature_opt, + th::optional len_penalty_opt, + th::optional repetition_penalty_opt, + th::optional random_seed_opt, + th::optional return_cum_log_probs_opt); + +private: + const at::ScalarType st_; + IFLLaMA* ftgpt; + std::vector weights; +}; + +} // namespace torch_ext From a0276fbc0a08c409a0663042ba91516c8d1cf1ca Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 12 Sep 2023 00:24:21 +0000 Subject: [PATCH 03/55] rename varaible --- src/fastertransformer/models/llama/LLaMA.cc | 128 +++++++++--------- src/fastertransformer/models/llama/LLaMA.h | 24 ++-- .../models/llama/LLaMAContextDecoder.cc | 60 ++++---- .../models/llama/LLaMAContextDecoder.h | 14 +- .../models/llama/LLaMADecoder.cc | 58 ++++---- .../models/llama/LLaMADecoder.h | 14 +- .../models/llama/LLaMADecoderLayerWeight.cc | 20 +-- .../models/llama/LLaMADecoderLayerWeight.h | 12 +- .../models/llama/LLaMAWeight.cc | 32 ++--- .../models/llama/LLaMAWeight.h | 18 +-- 10 files changed, 190 insertions(+), 190 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 2ce2dae7b..4ac26a473 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "src/fastertransformer/models/gptneox/GptNeoX.h" +#include "src/fastertransformer/models/llama/LLaMA.h" #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/decoding_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" @@ -24,9 +24,9 @@ namespace fastertransformer { template -void GptNeoX::initialize() +void LLaMA::initialize() { - gpt_context_decoder_ = new GptNeoXContextDecoder(head_num_, + llama_context_decoder_ = new LLaMAContextDecoder(head_num_, size_per_head_, inter_size_, num_layer_, @@ -45,7 +45,7 @@ void GptNeoX::initialize() custom_all_reduce_comm_, enable_custom_all_reduce_); - gpt_decoder_ = new GptNeoXDecoder(head_num_, + llama_decoder_ = new LLaMADecoder(head_num_, size_per_head_, inter_size_, num_layer_, @@ -73,13 +73,13 @@ void GptNeoX::initialize() } template -void GptNeoX::allocateBuffer() +void LLaMA::allocateBuffer() { FT_CHECK(false); } template -void GptNeoX::allocateBuffer( +void LLaMA::allocateBuffer( size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); @@ -154,7 +154,7 @@ void GptNeoX::allocateBuffer( } template -void GptNeoX::freeBuffer() +void LLaMA::freeBuffer() { if (is_allocate_buffer_) { if (vocab_size_ != vocab_size_padded_) { @@ -206,7 +206,7 @@ void GptNeoX::freeBuffer() } template -GptNeoX::GptNeoX(size_t head_num, +LLaMA::LLaMA(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -262,7 +262,7 @@ GptNeoX::GptNeoX(size_t head_num, } template -GptNeoX::GptNeoX(size_t head_num, +LLaMA::LLaMA(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -319,66 +319,66 @@ GptNeoX::GptNeoX(size_t head_num, } template -GptNeoX::GptNeoX(GptNeoX const& gpt): - BaseLayer(gpt), - head_num_(gpt.head_num_), - size_per_head_(gpt.size_per_head_), - inter_size_(gpt.inter_size_), - num_layer_(gpt.num_layer_), - vocab_size_(gpt.vocab_size_), - rotary_embedding_dim_(gpt.rotary_embedding_dim_), - start_id_(gpt.start_id_), - end_id_(gpt.end_id_), - prompt_learning_start_id_(gpt.prompt_learning_start_id_), - prompt_learning_type_(gpt.prompt_learning_type_), - use_gptj_residual_(gpt.use_gptj_residual_), - hidden_units_(gpt.hidden_units_), - tensor_para_(gpt.tensor_para_), - pipeline_para_(gpt.pipeline_para_), - local_head_num_(gpt.local_head_num_), - vocab_size_padded_(gpt.vocab_size_padded_), - custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), - enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), - attention_type_(gpt.attention_type_) +LLaMA::LLaMA(LLaMA const& llama): + BaseLayer(llama), + head_num_(llama.head_num_), + size_per_head_(llama.size_per_head_), + inter_size_(llama.inter_size_), + num_layer_(llama.num_layer_), + vocab_size_(llama.vocab_size_), + rotary_embedding_dim_(llama.rotary_embedding_dim_), + start_id_(llama.start_id_), + end_id_(llama.end_id_), + prompt_learning_start_id_(llama.prompt_learning_start_id_), + prompt_learning_type_(llama.prompt_learning_type_), + use_gptj_residual_(llama.use_gptj_residual_), + hidden_units_(llama.hidden_units_), + tensor_para_(llama.tensor_para_), + pipeline_para_(llama.pipeline_para_), + local_head_num_(llama.local_head_num_), + vocab_size_padded_(llama.vocab_size_padded_), + custom_all_reduce_comm_(llama.custom_all_reduce_comm_), + enable_custom_all_reduce_(llama.enable_custom_all_reduce_), + attention_type_(llama.attention_type_) { initialize(); } template -GptNeoX::~GptNeoX() +LLaMA::~LLaMA() { - 
delete gpt_decoder_; + delete llama_decoder_; delete dynamic_decode_layer_; - delete gpt_context_decoder_; + delete llama_context_decoder_; freeBuffer(); } template -void GptNeoX::registerCallback(callback_sig* fn, void* ctx) +void LLaMA::registerCallback(callback_sig* fn, void* ctx) { token_generated_cb_ = fn; token_generated_ctx_ = ctx; } template -void GptNeoX::unRegisterCallback() +void LLaMA::unRegisterCallback() { token_generated_cb_ = nullptr; token_generated_ctx_ = nullptr; } template -void GptNeoX::forward(std::vector* output_tensors, +void LLaMA::forward(std::vector* output_tensors, const std::vector* input_tensors, - const GptNeoXWeight* gpt_weights) + const LLaMAWeight* llama_weights) { FT_CHECK(false); } template -void GptNeoX::forward(std::unordered_map* output_tensors, +void LLaMA::forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const GptNeoXWeight* gpt_weights) + const LLaMAWeight* llama_weights) { // input_tensors: // input_ids [batch_size, max_input_length] @@ -478,7 +478,7 @@ void GptNeoX::forward(std::unordered_map* output_t // throw errors when prompt task_name_ids are not found std::pair prefix_prompt_weight_length_pair; try { - prefix_prompt_weight_length_pair = gpt_weights->prompt_learning_table.at(task_id); + prefix_prompt_weight_length_pair = llama_weights->prompt_learning_table.at(task_id); } catch (const std::out_of_range& oor) { FT_LOG_ERROR("prefix_prompt_weights_lengths not found for prompt task id: " + task_id); @@ -594,8 +594,8 @@ void GptNeoX::forward(std::unordered_map* output_t param.from_tensor = context_decoder_input_buf_; param.output_ids = output_ids_buf_; param.input_lengths = tiled_input_lengths_buf_; - param.embedding_table = gpt_weights->pre_decoder_embedding_table; - param.pos_table = gpt_weights->position_encoding_table; + param.embedding_table = llama_weights->pre_decoder_embedding_table; + param.pos_table = llama_weights->position_encoding_table; param.prefix_soft_prompt_embedding = input_tensors->at("request_prompt_embedding").getPtr(); param.prefix_soft_prompt_lengths = input_tensors->at("request_prompt_lengths").getPtr(); param.input_ids = tiled_input_ids_buf_; @@ -614,8 +614,8 @@ void GptNeoX::forward(std::unordered_map* output_t else { invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, output_ids_buf_, - gpt_weights->pre_decoder_embedding_table, - gpt_weights->position_encoding_table, + llama_weights->pre_decoder_embedding_table, + llama_weights->position_encoding_table, pPromptTuningParam{}, // no p/prompt tuning tiled_input_ids_buf_, 1, @@ -673,8 +673,8 @@ void GptNeoX::forward(std::unordered_map* output_t {"last_token_hidden_units", Tensor{MEMORY_GPU, data_type, {batch_size * beam_width, hidden_units_}, decoder_output_buf_}}}; - gpt_context_decoder_->forward( - &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); + llama_context_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); invokeDecodingInitialize(finished_buf_, sequence_lengths_, @@ -739,16 +739,16 @@ void GptNeoX::forward(std::unordered_map* output_t } if (vocab_size_ == vocab_size_padded_) { - padded_embedding_kernel_ptr_ = gpt_weights->post_decoder_embedding.kernel; + padded_embedding_kernel_ptr_ = llama_weights->post_decoder_embedding.kernel; } else { cudaMemcpyAsync(padded_embedding_kernel_, - gpt_weights->post_decoder_embedding.kernel, + llama_weights->post_decoder_embedding.kernel, sizeof(T) * 
vocab_size_ * hidden_units_, cudaMemcpyDeviceToDevice, stream_); cudaMemcpyAsync(padded_embedding_bias_, - gpt_weights->post_decoder_embedding.bias, + llama_weights->post_decoder_embedding.bias, sizeof(T) * vocab_size_, cudaMemcpyDeviceToDevice, stream_); @@ -782,8 +782,8 @@ void GptNeoX::forward(std::unordered_map* output_t if (!(max_input_length > 1 && step == max_input_length)) { if (pipeline_para_.rank_ == 0) { invokeEmbeddingLookupPosEncodingPadCount(decoder_input_buf_ + hidden_units_offset, - gpt_weights->pre_decoder_embedding_table, - gpt_weights->position_encoding_table, + llama_weights->pre_decoder_embedding_table, + llama_weights->position_encoding_table, output_ids_buf_ + id_offset, tiled_total_padding_count_ + id_offset, local_batch_size * beam_width, @@ -838,15 +838,15 @@ void GptNeoX::forward(std::unordered_map* output_t decoder_output_buf_ + hidden_units_offset}}, {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}}; - gpt_decoder_->forward( - &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); + llama_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); } if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { invokeGeneralLayerNorm(normed_decoder_output_buf_ + hidden_units_offset, decoder_output_buf_ + hidden_units_offset, - gpt_weights->post_decoder_layernorm.gamma, - gpt_weights->post_decoder_layernorm.beta, + llama_weights->post_decoder_layernorm.gamma, + llama_weights->post_decoder_layernorm.beta, layernorm_eps_, local_batch_size * beam_width, hidden_units_, @@ -1045,7 +1045,7 @@ void GptNeoX::forward(std::unordered_map* output_t } template -void GptNeoX::sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, +void LLaMA::sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); @@ -1080,7 +1080,7 @@ void GptNeoX::sendTensorsToFirstPipelineNode(std::unordered_map -void GptNeoX::setOutputTensors(std::unordered_map* output_tensors, +void LLaMA::setOutputTensors(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, const size_t max_input_length, const size_t max_output_seq_len) @@ -1176,36 +1176,36 @@ void GptNeoX::setOutputTensors(std::unordered_map* } template -size_t GptNeoX::getPipelineParallelRank() +size_t LLaMA::getPipelineParallelRank() { return pipeline_para_.rank_; } template -size_t GptNeoX::getPipelineParallelSize() +size_t LLaMA::getPipelineParallelSize() { return pipeline_para_.world_size_; } template -size_t GptNeoX::getTensorParallelRank() +size_t LLaMA::getTensorParallelRank() { return tensor_para_.rank_; } template -size_t GptNeoX::getTensorParallelSize() +size_t LLaMA::getTensorParallelSize() { return tensor_para_.world_size_; } template -bool* GptNeoX::getFinishBuffer() +bool* LLaMA::getFinishBuffer() { return finished_buf_; } -template class GptNeoX; -template class GptNeoX; +template class LLaMA; +template class LLaMA; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 9749a2070..5cf7b0025 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -20,16 +20,16 @@ #include #include "src/fastertransformer/layers/DynamicDecodeLayer.h" -#include 
"src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h" -#include "src/fastertransformer/models/gptneox/GptNeoXDecoder.h" -#include "src/fastertransformer/models/gptneox/GptNeoXWeight.h" +#include "src/fastertransformer/models/llama/LLaMAContextDecoder.h" +#include "src/fastertransformer/models/llama/LLaMADecoder.h" +#include "src/fastertransformer/models/llama/LLaMAWeight.h" #include "src/fastertransformer/utils/custom_ar_comm.h" #include "src/fastertransformer/utils/prompt_learning.h" namespace fastertransformer { template -class GptNeoX: public BaseLayer { +class LLaMA: public BaseLayer { private: // meta data size_t head_num_; @@ -69,8 +69,8 @@ class GptNeoX: public BaseLayer { bool has_prefix_prompt_; bool has_prefix_soft_prompt_; - GptNeoXDecoder* gpt_decoder_; - GptNeoXContextDecoder* gpt_context_decoder_; + LLaMADecoder* llama_decoder_; + LLaMAContextDecoder* llama_context_decoder_; DynamicDecodeLayer* dynamic_decode_layer_; void allocateBuffer() override; @@ -137,7 +137,7 @@ class GptNeoX: public BaseLayer { const std::unordered_map* input_tensors); public: - GptNeoX(size_t head_num, + LLaMA(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -164,7 +164,7 @@ class GptNeoX: public BaseLayer { std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce = 0); - GptNeoX(size_t head_num, + LLaMA(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -193,17 +193,17 @@ class GptNeoX: public BaseLayer { std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce = 0); - GptNeoX(GptNeoX const& GptNeoX); + LLaMA(LLaMA const& LLaMA); - ~GptNeoX(); + ~LLaMA(); void forward(std::vector* output_tensors, const std::vector* input_tensors, - const GptNeoXWeight* gpt_weights); + const LLaMAWeight* llama_weights); void forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const GptNeoXWeight* gpt_weights); + const LLaMAWeight* llama_weights); size_t getPipelineParallelRank(); size_t getPipelineParallelSize(); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index f23d1a977..69ed839a3 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h" +#include "src/fastertransformer/models/llama/LLaMAContextDecoder.h" #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" @@ -24,7 +24,7 @@ namespace fastertransformer { template -void GptNeoXContextDecoder::initialize() +void LLaMAContextDecoder::initialize() { self_attention_layer_ = new TensorParallelGptContextAttentionLayer(0, // max_batch_size 0, // max_seq_len @@ -64,13 +64,13 @@ void GptNeoXContextDecoder::initialize() } template -void GptNeoXContextDecoder::allocateBuffer() +void LLaMAContextDecoder::allocateBuffer() { FT_CHECK(false); } template -void GptNeoXContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -88,7 +88,7 @@ void GptNeoXContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) } template -void GptNeoXContextDecoder::freeBuffer() +void LLaMAContextDecoder::freeBuffer() { if (is_allocate_buffer_ == true) { allocator_->free((void**)(&decoder_normed_input_)); @@ -103,7 +103,7 @@ void GptNeoXContextDecoder::freeBuffer() } template -bool GptNeoXContextDecoder::isValidLayerParallelId(uint l) +bool LLaMAContextDecoder::isValidLayerParallelId(uint l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) @@ -111,28 +111,28 @@ bool GptNeoXContextDecoder::isValidLayerParallelId(uint l) } template -bool GptNeoXContextDecoder::isFirstLayerParallelId(uint l) +bool LLaMAContextDecoder::isFirstLayerParallelId(uint l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); } template -bool GptNeoXContextDecoder::isLastLayerParallelId(uint l) +bool LLaMAContextDecoder::isLastLayerParallelId(uint l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return l < num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); } template -int GptNeoXContextDecoder::getFirstLayerParallelId() +int LLaMAContextDecoder::getFirstLayerParallelId() { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return local_num_layer * pipeline_para_.rank_; } template -GptNeoXContextDecoder::GptNeoXContextDecoder(size_t head_num, +LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -171,7 +171,7 @@ GptNeoXContextDecoder::GptNeoXContextDecoder(size_t } template -GptNeoXContextDecoder::GptNeoXContextDecoder(GptNeoXContextDecoder const& decoder): +LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decoder): BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), head_num_(decoder.head_num_), size_per_head_(decoder.size_per_head_), @@ -193,7 +193,7 @@ GptNeoXContextDecoder::GptNeoXContextDecoder(GptNeoXContextDecoder const& } template -GptNeoXContextDecoder::~GptNeoXContextDecoder() +LLaMAContextDecoder::~LLaMAContextDecoder() { delete self_attention_layer_; delete ffn_layer_; @@ -201,9 +201,9 @@ GptNeoXContextDecoder::~GptNeoXContextDecoder() } template -void GptNeoXContextDecoder::forward(std::vector* 
output_tensors, +void LLaMAContextDecoder::forward(std::vector* output_tensors, const std::vector* input_tensors, - const std::vector*>* gpt_decoder_layer_weight) + const std::vector*>* llama_decoder_layer_weight) { std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, {"attention_mask", input_tensors->at(1)}, @@ -213,13 +213,13 @@ void GptNeoXContextDecoder::forward(std::vector* {"value_cache", output_tensors->at(2)}, {"last_token_hidden_units", output_tensors->at(3)}}; - forward(&output_tensors_map, &input_tensors_map, gpt_decoder_layer_weight); + forward(&output_tensors_map, &input_tensors_map, llama_decoder_layer_weight); } template -void GptNeoXContextDecoder::forward(std::unordered_map* output_tensors, +void LLaMAContextDecoder::forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const std::vector*>* gpt_decoder_layer_weight) + const std::vector*>* llama_decoder_layer_weight) { // input tensors: // decoder_input [batch_size, seq_len, hidden_dimension], @@ -332,8 +332,8 @@ void GptNeoXContextDecoder::forward(std::unordered_map* invokeGeneralLayerNorm(decoder_normed_input_, layer_input, - gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, layernorm_eps_, h_token_num, hidden_units_, @@ -393,14 +393,14 @@ void GptNeoXContextDecoder::forward(std::unordered_map* self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, - &gpt_decoder_layer_weight->at(l)->self_attention_weights); + &llama_decoder_layer_weight->at(l)->self_attention_weights); if (is_final == false) { if (use_gptj_residual_) { invokeGeneralLayerNorm(decoder_normed_input_, layer_input, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, layernorm_eps_, h_token_num, hidden_units_, @@ -414,9 +414,9 @@ void GptNeoXContextDecoder::forward(std::unordered_map* decoder_normed_input_, self_attn_output_, layer_input, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - gpt_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, layernorm_eps_, h_token_num, hidden_units_, @@ -437,7 +437,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* {h_token_num, (size_t)hidden_units_}, use_gptj_residual_ ? 
ffn_output_ : layer_output}}}); ffn_layer_->forward( - &ffn_output_tensors, &ffn_input_tensors, &gpt_decoder_layer_weight->at(l)->ffn_weights); + &ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); if (use_gptj_residual_) { // Original workflow: @@ -451,7 +451,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* ffn_output_, self_attn_output_, layer_input, - gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, h_token_num, hidden_units_, tensor_para_.world_size_, @@ -464,7 +464,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* else { invokeAddBiasResidual(layer_output, self_attn_output_, - gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, h_token_num, hidden_units_, stream_); @@ -508,7 +508,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* } } -template class GptNeoXContextDecoder; -template class GptNeoXContextDecoder; +template class LLaMAContextDecoder; +template class LLaMAContextDecoder; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index c81dcfe90..b84285f14 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -23,7 +23,7 @@ #include "src/fastertransformer/layers/BaseLayer.h" #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" -#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" #include "src/fastertransformer/utils/Tensor.h" #include "src/fastertransformer/utils/allocator.h" #include "src/fastertransformer/utils/cublasMMWrapper.h" @@ -33,7 +33,7 @@ namespace fastertransformer { template -class GptNeoXContextDecoder: public BaseLayer { +class LLaMAContextDecoder: public BaseLayer { private: // meta data size_t head_num_; @@ -82,7 +82,7 @@ class GptNeoXContextDecoder: public BaseLayer { int* cu_seqlens_ = nullptr; public: - GptNeoXContextDecoder(size_t head_num, + LLaMAContextDecoder(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -101,17 +101,17 @@ class GptNeoXContextDecoder: public BaseLayer { std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); - GptNeoXContextDecoder(GptNeoXContextDecoder const& decoder); + LLaMAContextDecoder(LLaMAContextDecoder const& decoder); - ~GptNeoXContextDecoder(); + ~LLaMAContextDecoder(); void forward(std::vector* output_tensors, const std::vector* input_tensors, - const std::vector*>* decoder_layer_weights); + const std::vector*>* decoder_layer_weights); void forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const std::vector*>* gpt_decoder_layer_weight); + const std::vector*>* llama_decoder_layer_weight); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoder.cc b/src/fastertransformer/models/llama/LLaMADecoder.cc index 7b73ba8ee..3a8fc1458 100644 --- a/src/fastertransformer/models/llama/LLaMADecoder.cc +++ b/src/fastertransformer/models/llama/LLaMADecoder.cc @@ -14,14 +14,14 @@ * limitations under the License. 
*/ -#include "src/fastertransformer/models/gptneox/GptNeoXDecoder.h" +#include "src/fastertransformer/models/llama/LLaMADecoder.h" #include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" #include "src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.h" namespace fastertransformer { template -void GptNeoXDecoder::initialize() +void LLaMADecoder::initialize() { self_attention_layer_ = new TensorParallelDecoderSelfAttentionLayer(0, // max_batch_size head_num_, @@ -59,13 +59,13 @@ void GptNeoXDecoder::initialize() } template -void GptNeoXDecoder::allocateBuffer() +void LLaMADecoder::allocateBuffer() { FT_CHECK(false); } template -void GptNeoXDecoder::allocateBuffer(size_t batch_size) +void LLaMADecoder::allocateBuffer(size_t batch_size) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * hidden_units_, false)); @@ -79,7 +79,7 @@ void GptNeoXDecoder::allocateBuffer(size_t batch_size) } template -void GptNeoXDecoder::freeBuffer() +void LLaMADecoder::freeBuffer() { if (is_allocate_buffer_ == true) { allocator_->free((void**)(&decoder_normed_input_)); @@ -91,7 +91,7 @@ void GptNeoXDecoder::freeBuffer() } template -bool GptNeoXDecoder::isValidLayerParallelId(uint l) +bool LLaMADecoder::isValidLayerParallelId(uint l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) @@ -99,28 +99,28 @@ bool GptNeoXDecoder::isValidLayerParallelId(uint l) } template -bool GptNeoXDecoder::isFirstLayerParallelId(uint l) +bool LLaMADecoder::isFirstLayerParallelId(uint l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); } template -bool GptNeoXDecoder::isLastLayerParallelId(uint l) +bool LLaMADecoder::isLastLayerParallelId(uint l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return l < num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); } template -int GptNeoXDecoder::getFirstLayerParallelId() +int LLaMADecoder::getFirstLayerParallelId() { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); return local_num_layer * pipeline_para_.rank_; } template -GptNeoXDecoder::GptNeoXDecoder(size_t head_num, +LLaMADecoder::LLaMADecoder(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -155,7 +155,7 @@ GptNeoXDecoder::GptNeoXDecoder(size_t head_num, } template -GptNeoXDecoder::GptNeoXDecoder(GptNeoXDecoder const& decoder): +LLaMADecoder::LLaMADecoder(LLaMADecoder const& decoder): BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), head_num_(decoder.head_num_), size_per_head_(decoder.size_per_head_), @@ -175,7 +175,7 @@ GptNeoXDecoder::GptNeoXDecoder(GptNeoXDecoder const& decoder): } template -GptNeoXDecoder::~GptNeoXDecoder() +LLaMADecoder::~LLaMADecoder() { delete self_attention_layer_; delete ffn_layer_; @@ -183,17 +183,17 @@ GptNeoXDecoder::~GptNeoXDecoder() } template -void GptNeoXDecoder::forward(std::vector* output_tensors, +void LLaMADecoder::forward(std::vector* output_tensors, const std::vector* input_tensors, - const std::vector*>* gpt_decoder_layer_weight) + const std::vector*>* llama_decoder_layer_weight) { FT_CHECK(false); } template -void GptNeoXDecoder::forward(std::unordered_map* output_tensors, +void 
LLaMADecoder::forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const std::vector*>* gpt_decoder_layer_weight) + const std::vector*>* llama_decoder_layer_weight) { // input tensors: // decoder_input [local_batch_size, hidden_dimension], @@ -263,8 +263,8 @@ void GptNeoXDecoder::forward(std::unordered_map* invokeGeneralLayerNorm(decoder_normed_input_, layer_input, - gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, layernorm_eps_, local_batch_size, hidden_units_, @@ -294,12 +294,12 @@ void GptNeoXDecoder::forward(std::unordered_map* self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, - &gpt_decoder_layer_weight->at(l)->self_attention_weights); + &llama_decoder_layer_weight->at(l)->self_attention_weights); if (use_gptj_residual_) { invokeGeneralLayerNorm(decoder_normed_input_, layer_input, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, layernorm_eps_, local_batch_size, hidden_units_, @@ -313,9 +313,9 @@ void GptNeoXDecoder::forward(std::unordered_map* decoder_normed_input_, self_attn_output_, layer_input, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - gpt_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, layernorm_eps_, local_batch_size, hidden_units_, @@ -334,7 +334,7 @@ void GptNeoXDecoder::forward(std::unordered_map* data_type, {local_batch_size, hidden_units_}, use_gptj_residual_ ? 
ffn_output_ : layer_output}}}); - ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &gpt_decoder_layer_weight->at(l)->ffn_weights); + ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); if (use_gptj_residual_) { // Original workflow: @@ -346,7 +346,7 @@ void GptNeoXDecoder::forward(std::unordered_map* ffn_output_, self_attn_output_, layer_input, - gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, local_batch_size, hidden_units_, tensor_para_.world_size_, @@ -358,7 +358,7 @@ void GptNeoXDecoder::forward(std::unordered_map* else { invokeAddBiasResidual(layer_output, self_attn_output_, - gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, local_batch_size, hidden_units_, stream_); @@ -385,7 +385,7 @@ void GptNeoXDecoder::forward(std::unordered_map* } } -template class GptNeoXDecoder; -template class GptNeoXDecoder; +template class LLaMADecoder; +template class LLaMADecoder; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoder.h b/src/fastertransformer/models/llama/LLaMADecoder.h index add736adc..cbbc272ff 100644 --- a/src/fastertransformer/models/llama/LLaMADecoder.h +++ b/src/fastertransformer/models/llama/LLaMADecoder.h @@ -23,7 +23,7 @@ #include "src/fastertransformer/layers/BaseLayer.h" #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" -#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" #include "src/fastertransformer/utils/Tensor.h" #include "src/fastertransformer/utils/allocator.h" #include "src/fastertransformer/utils/cublasMMWrapper.h" @@ -33,7 +33,7 @@ namespace fastertransformer { template -class GptNeoXDecoder: public BaseLayer { +class LLaMADecoder: public BaseLayer { private: protected: void allocateBuffer() override; @@ -71,7 +71,7 @@ class GptNeoXDecoder: public BaseLayer { FfnLayer* ffn_layer_; public: - GptNeoXDecoder(size_t head_num, + LLaMADecoder(size_t head_num, size_t size_per_head, size_t inter_size, size_t num_layer, @@ -88,17 +88,17 @@ class GptNeoXDecoder: public BaseLayer { std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); - GptNeoXDecoder(GptNeoXDecoder const& decoder); + LLaMADecoder(LLaMADecoder const& decoder); - virtual ~GptNeoXDecoder(); + virtual ~LLaMADecoder(); virtual void forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const std::vector*>* decoder_layer_weights); + const std::vector*>* decoder_layer_weights); virtual void forward(std::vector* output_tensors, const std::vector* input_tensors, - const std::vector*>* decoder_layer_weights); + const std::vector*>* decoder_layer_weights); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc index 3d62df83d..9ed355047 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc @@ -14,13 +14,13 @@ * limitations under the License. 
*/ -#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" #include "src/fastertransformer/utils/memory_utils.h" namespace fastertransformer { template -GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const int hidden_units, +LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size, const int tensor_para_rank, @@ -36,7 +36,7 @@ GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const int hidden_units, } template -GptNeoXDecoderLayerWeight::~GptNeoXDecoderLayerWeight() +LLaMADecoderLayerWeight::~LLaMADecoderLayerWeight() { if (is_maintain_buffer == true) { for (int i = 0; i < 12; i++) { @@ -63,7 +63,7 @@ GptNeoXDecoderLayerWeight::~GptNeoXDecoderLayerWeight() } template -GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other): +LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other): hidden_units_(other.hidden_units_), inter_size_(other.inter_size_), tensor_para_size_(other.tensor_para_size_), @@ -90,7 +90,7 @@ GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const GptNeoXDecoderLaye } template -GptNeoXDecoderLayerWeight& GptNeoXDecoderLayerWeight::operator=(const GptNeoXDecoderLayerWeight& other) +LLaMADecoderLayerWeight& LLaMADecoderLayerWeight::operator=(const LLaMADecoderLayerWeight& other) { hidden_units_ = other.hidden_units_; inter_size_ = other.inter_size_; @@ -119,7 +119,7 @@ GptNeoXDecoderLayerWeight& GptNeoXDecoderLayerWeight::operator=(const GptN } template -void GptNeoXDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) +void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) { FT_CHECK(is_maintain_buffer == true); const std::string rank_spec = std::to_string(tensor_para_rank_); @@ -175,7 +175,7 @@ void GptNeoXDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataTyp } template -void GptNeoXDecoderLayerWeight::setWeightPtr() +void LLaMADecoderLayerWeight::setWeightPtr() { pre_layernorm_weights.beta = weights_ptr[0]; pre_layernorm_weights.gamma = weights_ptr[1]; @@ -195,7 +195,7 @@ void GptNeoXDecoderLayerWeight::setWeightPtr() } template -void GptNeoXDecoderLayerWeight::mallocWeights() +void LLaMADecoderLayerWeight::mallocWeights() { deviceMalloc(&weights_ptr[0], hidden_units_); deviceMalloc(&weights_ptr[1], hidden_units_); @@ -214,7 +214,7 @@ void GptNeoXDecoderLayerWeight::mallocWeights() deviceMalloc(&weights_ptr[11], hidden_units_); } -template struct GptNeoXDecoderLayerWeight; -template struct GptNeoXDecoderLayerWeight; +template struct LLaMADecoderLayerWeight; +template struct LLaMADecoderLayerWeight; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h index 2850da466..44726f58c 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h @@ -26,17 +26,17 @@ namespace fastertransformer { template -struct GptNeoXDecoderLayerWeight { +struct LLaMADecoderLayerWeight { public: - GptNeoXDecoderLayerWeight() = default; - GptNeoXDecoderLayerWeight(const int hidden_units, + LLaMADecoderLayerWeight() = default; + LLaMADecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, const int tensor_para_rank = 0, const bool use_gptj_residual = 
true); - ~GptNeoXDecoderLayerWeight(); - GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other); - GptNeoXDecoderLayerWeight& operator=(const GptNeoXDecoderLayerWeight& other); + ~LLaMADecoderLayerWeight(); + LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other); + LLaMADecoderLayerWeight& operator=(const LLaMADecoderLayerWeight& other); void loadModel(std::string dir_path, FtCudaDataType model_file_type); diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc index 26995f255..cc8c5ab25 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.cc +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -14,12 +14,12 @@ * limitations under the License. */ -#include "src/fastertransformer/models/gptneox/GptNeoXWeight.h" +#include "src/fastertransformer/models/llama/LLaMAWeight.h" namespace fastertransformer { template -GptNeoXWeight::GptNeoXWeight(const int hidden_units, +LLaMAWeight::LLaMAWeight(const int hidden_units, const int inter_size, const int vocab_size, const int num_layer, @@ -61,13 +61,13 @@ GptNeoXWeight::GptNeoXWeight(const int hidde decoder_layer_weights.reserve(num_layer_); for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { - decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight( + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight( hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); } else { // Layer-parallelism: allocate empty layer because // this rank does not compute it: - decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight(0, 0)); + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight(0, 0)); } } @@ -76,7 +76,7 @@ GptNeoXWeight::GptNeoXWeight(const int hidde } template -GptNeoXWeight::~GptNeoXWeight() +LLaMAWeight::~LLaMAWeight() { if (is_maintain_buffer == true) { for (int i = 0; i < weights_ptr.size(); i++) { @@ -92,7 +92,7 @@ GptNeoXWeight::~GptNeoXWeight() } template -GptNeoXWeight::GptNeoXWeight(const GptNeoXWeight& other): +LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): hidden_units_(other.hidden_units_), inter_size_(other.inter_size_), vocab_size_(other.vocab_size_), @@ -137,7 +137,7 @@ GptNeoXWeight::GptNeoXWeight(const GptNeoXWeight& other): } template -GptNeoXWeight& GptNeoXWeight::operator=(const GptNeoXWeight& other) +LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) { hidden_units_ = other.hidden_units_; inter_size_ = other.inter_size_; @@ -184,7 +184,7 @@ GptNeoXWeight& GptNeoXWeight::operator=(const GptNeoXWeight& other) } template -void GptNeoXWeight::setWeightPtr() +void LLaMAWeight::setWeightPtr() { prompt_learning_table.resize(prompt_learning_pair_.size()); @@ -207,7 +207,7 @@ void GptNeoXWeight::setWeightPtr() } template -void GptNeoXWeight::mallocWeights() +void LLaMAWeight::mallocWeights() { weights_ptr.resize(num_base_weights + prompt_learning_pair_.size()); @@ -233,9 +233,9 @@ void GptNeoXWeight::mallocWeights() } template -void GptNeoXWeight::loadModel(std::string dir_path) +void LLaMAWeight::loadModel(std::string dir_path) { - FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "gptneox"); + FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "llama"); FT_CHECK(is_maintain_buffer == true); loadWeightFromBin( @@ -279,24 +279,24 @@ void GptNeoXWeight::loadModel(std::string dir_path) } template -void GptNeoXWeight::resizeLayer(const int num_layer) +void LLaMAWeight::resizeLayer(const int num_layer) { num_layer_ = 
num_layer; decoder_layer_weights.reserve(num_layer_); for (int l = 0; l < num_layer_; l++) { - decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight()); + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight()); } } template -bool GptNeoXWeight::isValidLayerParallelId(int l) +bool LLaMAWeight::isValidLayerParallelId(int l) { int local_num_layer = (int)(ceil(num_layer_ * 1.0f / layer_para_size_)); return l < num_layer_ && (l >= local_num_layer * layer_para_rank_) && (l < local_num_layer * (layer_para_rank_ + 1)); } -template struct GptNeoXWeight; -template struct GptNeoXWeight; +template struct LLaMAWeight; +template struct LLaMAWeight; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAWeight.h b/src/fastertransformer/models/llama/LLaMAWeight.h index 3e868854e..dd602c107 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.h +++ b/src/fastertransformer/models/llama/LLaMAWeight.h @@ -17,17 +17,17 @@ #pragma once #include "src/fastertransformer/kernels/layernorm_kernels.h" -#include "src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h" +#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" #include "src/fastertransformer/utils/memory_utils.h" #include "src/fastertransformer/utils/prompt_learning.h" namespace fastertransformer { template -struct GptNeoXWeight { +struct LLaMAWeight { - GptNeoXWeight() = default; - GptNeoXWeight( + LLaMAWeight() = default; + LLaMAWeight( const int hidden_units, const int inter_size, const int vocab_size, @@ -41,18 +41,18 @@ struct GptNeoXWeight { PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, std::map> prompt_learning_pair = std::map>{}); - ~GptNeoXWeight(); - GptNeoXWeight(const GptNeoXWeight& other); - GptNeoXWeight& operator=(const GptNeoXWeight& other); + ~LLaMAWeight(); + LLaMAWeight(const LLaMAWeight& other); + LLaMAWeight& operator=(const LLaMAWeight& other); void loadModel(std::string dir_path); void resizeLayer(const int num_layer); - std::vector*> decoder_layer_weights; + std::vector*> decoder_layer_weights; const T* pre_decoder_embedding_table = nullptr; // GPT-J does not use embedding table, but we leave the ptr such that - // GptNeoX::forward and Gpt::forward become identical + // LLaMA::forward and Gpt::forward become identical const T* position_encoding_table = nullptr; /* From a590e948eca2169b98008f9f1837b729838a295a Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 12 Sep 2023 08:31:13 +0000 Subject: [PATCH 04/55] dump --- examples/cpp/CMakeLists.txt | 1 + src/fastertransformer/models/llama/LLaMA.cc | 171 ++---------------- src/fastertransformer/models/llama/LLaMA.h | 16 -- .../models/llama/LLaMAWeight.cc | 103 +---------- .../models/llama/LLaMAWeight.h | 16 +- src/fastertransformer/th_op/llama/LLaMA.cc | 8 +- src/fastertransformer/th_op/llama/LLaMA.h | 44 ++--- 7 files changed, 52 insertions(+), 307 deletions(-) diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index da24d72c6..38ae86412 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -26,6 +26,7 @@ add_subdirectory(wenet) add_subdirectory(gptj) add_subdirectory(gptneox) +add_subdirectory(llama) add_subdirectory(multi_gpu_gpt) if(ENABLE_FP8) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 4ac26a473..575636fb4 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -214,16 +214,8 @@ LLaMA::LLaMA(size_t head_num, size_t 
rotary_embedding_dim, int start_id, int end_id, - int prompt_learning_start_id, // only needed by p/prompt-tuning - PromptLearningType prompt_learning_type, bool use_gptj_residual, - float beam_search_diversity_rate, - size_t top_k, - float top_p, unsigned long long random_seed, - float temperature, - float len_penalty, - float repetition_penalty, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, @@ -241,8 +233,6 @@ LLaMA::LLaMA(size_t head_num, rotary_embedding_dim_(rotary_embedding_dim), start_id_(start_id), end_id_(end_id), - prompt_learning_start_id_(prompt_learning_start_id), - prompt_learning_type_(prompt_learning_type), use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), local_head_num_(head_num / 1), @@ -270,16 +260,8 @@ LLaMA::LLaMA(size_t head_num, size_t rotary_embedding_dim, int start_id, int end_id, - int prompt_learning_start_id, // only needed by p/prompt-tuning - PromptLearningType prompt_learning_type, bool use_gptj_residual, - float beam_search_diversity_rate, - size_t top_k, - float top_p, unsigned long long random_seed, - float temperature, - float len_penalty, - float repetition_penalty, NcclParam tensor_para, NcclParam pipeline_para, cudaStream_t stream, @@ -299,8 +281,6 @@ LLaMA::LLaMA(size_t head_num, rotary_embedding_dim_(rotary_embedding_dim), start_id_(start_id), end_id_(end_id), - prompt_learning_start_id_(prompt_learning_start_id), - prompt_learning_type_(prompt_learning_type), use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), tensor_para_(tensor_para), @@ -330,7 +310,6 @@ LLaMA::LLaMA(LLaMA const& llama): start_id_(llama.start_id_), end_id_(llama.end_id_), prompt_learning_start_id_(llama.prompt_learning_start_id_), - prompt_learning_type_(llama.prompt_learning_type_), use_gptj_residual_(llama.use_gptj_residual_), hidden_units_(llama.hidden_units_), tensor_para_(llama.tensor_para_), @@ -383,26 +362,13 @@ void LLaMA::forward(std::unordered_map* output_ten // input_tensors: // input_ids [batch_size, max_input_length] // input_lengths [batch_size] - // prompt_learning_task_name_ids [batch_size] on cpu, optional // output_seq_len [batch_size] on cpu // start_id [batch_size] on cpu, optional // end_id [batch_size] on cpu, optional // stop_words_list [batch_size, 2, stop_words_length], optional // bad_words_list [2, bad_words_length] or [batch_size, 2, bad_words_length], optional - // runtime_top_k [1] or [batch_size] on cpu, optional, uint. - // runtime_top_p [1] or [batch_size] on cpu, optional, float. - // beam_search_diversity_rate [1] or [batch_size] on cpu, optional, float. - // temperature [1] or [batch_size] on cpu, optional, float. - // len_penalty [1] or [batch_size] on cpu, optional, float. - // repetition_penalty [1] or [batch_size] on cpu, optional, float. // min_length [1] or [batch_size] on cpu, optional, int // random_seed [1] or [batch_size] on cpu, optional, unsigned long long int. 
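    // Only input_ids, input_lengths and output_seq_len are required; the remaining entries are optional.
    // A minimal sketch of the map a caller might build (pointer names d_input_ids, d_input_lengths and
    // h_output_seq_len are illustrative; memory types and dtypes follow the torch wrapper above):
    //   std::unordered_map<std::string, Tensor> input_tensors{
    //       {"input_ids",      Tensor{MEMORY_GPU, TYPE_INT32,  {batch_size, max_input_length}, d_input_ids}},
    //       {"input_lengths",  Tensor{MEMORY_GPU, TYPE_INT32,  {batch_size},                   d_input_lengths}},
    //       {"output_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, {batch_size},                   h_output_seq_len}}};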
- // request_prompt_lengths [batch_size], optional - // request_prompt_embedding [batch_size, max_prompt_length, hidden_units], float, optional - // requst_prompt_type [batch_size], int, optional - // top_p_decay [batch_size] on gpu, float, optional - // top_p_min [batch_size] on gpu, float, optional - // top_p_reset_ids [batch_size] on gpu, uint32, optional // output_tensors: // output_ids [batch_size, beam_width, max_output_seq_len] @@ -432,83 +398,28 @@ void LLaMA::forward(std::unordered_map* output_ten const size_t batch_size = output_tensors->at("output_ids").shape[0]; const size_t beam_width = output_tensors->at("output_ids").shape[1]; - PromptLearningType request_prompt_type = PromptLearningType::no_prompt; - int valid_prompt_inputs = input_tensors->count("request_prompt_type") - + input_tensors->count("request_prompt_lengths") - + input_tensors->count("request_prompt_embedding"); - - if (valid_prompt_inputs == 3) { - request_prompt_type = static_cast(input_tensors->at("request_prompt_type").getVal()); - FT_LOG_INFO("Apply prompt embedding from input, will ignore task name ids"); - } - else if (valid_prompt_inputs > 0) { - FT_LOG_WARNING( - "Prompts not applied: request_prompt_embedding, request_prompt_lengths, request_prompt_type are all needed!"); - } - if (request_prompt_type == PromptLearningType::prefix_prompt) { - FT_LOG_WARNING("Request prompt doesn't support prefix prompt currently!"); - } // Prefix Prompt Inputs // Padding works as follows: p p x x i i i x x --> p p i i i x x x x (p denotes prompt, i denotes input, x denotes // pad) // TODO (perkzz): move unnecessary paddings - const int* prompt_learning_task_name_ids = - input_tensors->count("prompt_learning_task_name_ids") ? - input_tensors->at("prompt_learning_task_name_ids").getPtr() : - nullptr; - has_prefix_prompt_ = - (prompt_learning_task_name_ids != nullptr) && (prompt_learning_type_ == PromptLearningType::prefix_prompt); + has_prefix_prompt_ = false; int max_prefix_prompt_length = 0; - FT_CHECK_WITH_INFO( - !(prompt_learning_task_name_ids != nullptr - && (prompt_learning_type_ == PromptLearningType::no_prompt - || prompt_learning_type_ == PromptLearningType::soft_prompt)), - "prompt_learning_type is prefix_prompt either p_prompt_tuning when prompt_learning_task_name_ids are provided."); - // NOTE: Prefix Prompt PreProcessing // get prefix_prompt_weight for each batch --> shape [batch, beam_width] // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] std::vector prefix_prompt_weight_batch_ptrs; std::vector prefix_prompt_lengths; - if (has_prefix_prompt_) { - for (int bs_id = 0; bs_id < batch_size; ++bs_id) { - int task_id = prompt_learning_task_name_ids[bs_id]; - // throw errors when prompt task_name_ids are not found - std::pair prefix_prompt_weight_length_pair; - try { - prefix_prompt_weight_length_pair = llama_weights->prompt_learning_table.at(task_id); - } - catch (const std::out_of_range& oor) { - FT_LOG_ERROR("prefix_prompt_weights_lengths not found for prompt task id: " + task_id); - throw oor; - } - for (int bw_id = 0; bw_id < beam_width; ++bw_id) { - prefix_prompt_weight_batch_ptrs.push_back(prefix_prompt_weight_length_pair.first); - prefix_prompt_lengths.push_back(prefix_prompt_weight_length_pair.second); - } - } - - max_prefix_prompt_length = *max_element(prefix_prompt_lengths.begin(), prefix_prompt_lengths.end()); - - FT_LOG_DEBUG("max_prefix_prompt_length: %d", max_prefix_prompt_length); - - if (max_prefix_prompt_length == 0) { - has_prefix_prompt_ = false; - 
FT_LOG_DEBUG("prompts are not applied !"); - } - } int max_input_length = input_tensors->at("input_ids").shape[1]; FT_CHECK_WITH_INFO(!(max_input_length == 0 && max_prefix_prompt_length > 0), "Prefix Prompt should come with inputs!"); // Prefix Soft Prompt - has_prefix_soft_prompt_ = request_prompt_type == PromptLearningType::soft_prompt; - const size_t max_prefix_soft_prompt_length = - has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; - const size_t limit_len_offset = max_prefix_soft_prompt_length + (max_input_length == 0 ? 1 : 0); + has_prefix_soft_prompt_ = false; + const size_t max_prefix_soft_prompt_length = 0; + const size_t limit_len_offset = 0 + (max_input_length == 0 ? 1 : 0); const size_t max_output_seq_len = input_tensors->at("output_seq_len").max() + limit_len_offset; const size_t max_seq_len = max_output_seq_len; // max cache seq len should include max prefix prompt length as it has k/v states @@ -527,7 +438,7 @@ void LLaMA::forward(std::unordered_map* output_ten } const cudaDataType_t gemm_data_type = getCudaDataType(); allocateBuffer( - batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length + max_prefix_soft_prompt_length); + batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length + 0); setSeqLimitLen(seq_limit_len_, input_tensors->at("output_seq_len"), limit_len_offset, batch_size); sync_check_cuda_error(); @@ -562,23 +473,11 @@ void LLaMA::forward(std::unordered_map* output_ten } // Prefix prompts - if (has_prefix_prompt_) { - cudaMemcpyAsync(prompt_learning_weight_batch_, - prefix_prompt_weight_batch_ptrs.data(), - sizeof(T*) * batch_size * beam_width, - cudaMemcpyDefault, - stream_); - cudaMemcpyAsync(tiled_prompt_lengths_buf_, - prefix_prompt_lengths.data(), - sizeof(int) * batch_size * beam_width, - cudaMemcpyDefault, - stream_); - } sync_check_cuda_error(); // handle first step - if (has_prefix_prompt_ || has_prefix_soft_prompt_ || max_input_length > 1) { + if (max_input_length > 1) { invokeTileGptInputs(tiled_input_ids_buf_, tiled_input_lengths_buf_, input_tensors->at("input_ids").getPtr(), @@ -589,43 +488,19 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); - if (has_prefix_soft_prompt_) { - inputIdsEmbeddingLookupPosEncodingSoftPromptParam param; - param.from_tensor = context_decoder_input_buf_; - param.output_ids = output_ids_buf_; - param.input_lengths = tiled_input_lengths_buf_; - param.embedding_table = llama_weights->pre_decoder_embedding_table; - param.pos_table = llama_weights->position_encoding_table; - param.prefix_soft_prompt_embedding = input_tensors->at("request_prompt_embedding").getPtr(); - param.prefix_soft_prompt_lengths = input_tensors->at("request_prompt_lengths").getPtr(); - param.input_ids = tiled_input_ids_buf_; - param.start_step = 1; - param.max_input_length = max_input_length; - param.max_prefix_soft_prompt_length = max_prefix_soft_prompt_length; - param.batch_size = batch_size; - param.beam_width = beam_width; - param.hidden_units = hidden_units_; - param.stream = stream_; - - invokeInputIdsEmbeddingLookupPosEncodingSoftPrompt(param); - sync_check_cuda_error(); - max_input_length += max_prefix_soft_prompt_length; // view soft_prompt as input - } - else { - invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, - output_ids_buf_, - llama_weights->pre_decoder_embedding_table, - llama_weights->position_encoding_table, - pPromptTuningParam{}, // no p/prompt tuning - tiled_input_ids_buf_, - 1, - max_input_length, - 
max_input_length, - batch_size * beam_width, - hidden_units_, - stream_); - sync_check_cuda_error(); - } + invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, + output_ids_buf_, + llama_weights->pre_decoder_embedding_table, + llama_weights->position_encoding_table, + pPromptTuningParam{}, // no p/prompt tuning + tiled_input_ids_buf_, + 1, + max_input_length, + max_input_length, + batch_size * beam_width, + hidden_units_, + stream_); + sync_check_cuda_error(); invokeBuildDecoderAttentionMask(input_attention_mask_, tiled_input_lengths_buf_, @@ -688,8 +563,6 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); } else if (max_input_length == 0) { - FT_CHECK(prompt_learning_type_ == PromptLearningType::no_prompt - && request_prompt_type == PromptLearningType::no_prompt); // Not support prompts in this case max_input_length++; invokeDecodingInitialize(finished_buf_, sequence_lengths_, @@ -709,8 +582,6 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); } else if (max_input_length == 1) { - FT_CHECK(prompt_learning_type_ == PromptLearningType::no_prompt - && request_prompt_type == PromptLearningType::no_prompt); // Not support prompts in this case invokeDecodingInitialize(finished_buf_, sequence_lengths_, nullptr, @@ -1093,8 +964,6 @@ void LLaMA::setOutputTensors(std::unordered_map* o const size_t batch_size = output_tensors->at("output_ids").shape[0]; const size_t beam_width = output_tensors->at("output_ids").shape[1]; uint* sequence_lengths = output_tensors->at("sequence_length").getPtr(); - const size_t max_prefix_soft_prompt_length = - has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; if (input_tensors->at("input_ids").shape[1] == 0) { invokeCudaD2DcpyConvert( @@ -1149,7 +1018,7 @@ void LLaMA::setOutputTensors(std::unordered_map* o param.prefix_soft_prompt_lengths = has_prefix_soft_prompt_ ? 
input_tensors->at("request_prompt_lengths").getPtr() : nullptr; param.input_lengths = tiled_input_lengths_buf_; - param.max_prefix_soft_prompt_length = max_prefix_soft_prompt_length; + param.max_prefix_soft_prompt_length = 0; param.max_input_without_prompt_length = max_input_length; param.stream = stream_; param.output_ids = output_tensors->at("output_ids").getPtr(); diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 5cf7b0025..48506f529 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -145,16 +145,8 @@ class LLaMA: public BaseLayer { size_t rotary_embedding_dim, int start_id, int end_id, - int prompt_learning_start_id, // only needed by p/prompt-tuning - PromptLearningType prompt_learning_type, bool use_gptj_residual, - float beam_search_diversity_rate, - size_t top_k, - float top_p, unsigned long long random_seed, - float temperature, - float len_penalty, - float repetition_penalty, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, @@ -172,16 +164,8 @@ class LLaMA: public BaseLayer { size_t rotary_embedding_dim, int start_id, int end_id, - int prompt_learning_start_id, // only needed by p/prompt-tuning - PromptLearningType prompt_learning_type, bool use_gptj_residual, - float beam_search_diversity_rate, - size_t top_k, - float top_p, unsigned long long random_seed, - float temperature, - float len_penalty, - float repetition_penalty, NcclParam tensor_para, NcclParam pipeline_para, cudaStream_t stream, diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc index cc8c5ab25..dddf6eff6 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.cc +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -28,9 +28,7 @@ LLaMAWeight::LLaMAWeight(const int hidden_un const int tensor_para_rank, const int layer_para_size, const int layer_para_rank, - const bool use_gptj_residual, - PromptLearningType prompt_learning_type, - std::map> prompt_learning_pair): + const bool use_gptj_residual): hidden_units_(hidden_units), inter_size_(inter_size), vocab_size_(vocab_size), @@ -40,23 +38,10 @@ LLaMAWeight::LLaMAWeight(const int hidden_un tensor_para_rank_(tensor_para_rank), layer_para_size_(layer_para_size), layer_para_rank_(layer_para_rank), - use_gptj_residual_(use_gptj_residual), - prompt_learning_type_(prompt_learning_type), - prompt_learning_pair_(prompt_learning_pair) + use_gptj_residual_(use_gptj_residual) { FT_CHECK(num_layer_ % layer_para_size_ == 0); - // set prompt weight size - if (prompt_learning_type_ == PromptLearningType::prefix_prompt) { - prompt_token_weight_size_ = 2 * num_layer_ * hidden_units_ / tensor_para_size_; - } - else if (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) { - prompt_token_weight_size_ = hidden_units_; - } - // set if load and malloc prompt weights - malloc_load_prompt_weights_ = !prompt_learning_pair_.empty() - && (prompt_learning_type_ == PromptLearningType::p_prompt_tuning - || prompt_learning_type_ == PromptLearningType::prefix_prompt); decoder_layer_weights.reserve(num_layer_); for (int l = 0; l < num_layer_; l++) { @@ -103,10 +88,7 @@ LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), use_gptj_residual_(other.use_gptj_residual_), - prompt_token_weight_size_(other.prompt_token_weight_size_), - malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), - 
prompt_learning_type_(other.prompt_learning_type_), - prompt_learning_pair_(other.prompt_learning_pair_) + prompt_token_weight_size_(other.prompt_token_weight_size_) { mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); @@ -115,18 +97,6 @@ LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); // prompt learning table: malloc weights and set weight ptr - if (malloc_load_prompt_weights_) { - for (auto const& prompt : prompt_learning_pair_) { - std::string task_name = prompt.first; - int task_name_id = prompt.second.first; - int prompt_length = prompt.second.second; - size_t prompt_id = num_base_weights + (size_t)task_name_id; - - // cuda device to device memcpy prompt table weights buffer memory - cudaD2Dcpy(weights_ptr[prompt_id], other.weights_ptr[prompt_id], prompt_length * prompt_token_weight_size_); - } - } - setWeightPtr(); decoder_layer_weights.clear(); @@ -150,9 +120,6 @@ LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) layer_para_rank_ = other.layer_para_rank_; use_gptj_residual_ = other.use_gptj_residual_; prompt_token_weight_size_ = other.prompt_token_weight_size_; - malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; - prompt_learning_type_ = other.prompt_learning_type_; - prompt_learning_pair_ = other.prompt_learning_pair_; mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); @@ -160,19 +127,6 @@ LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); - // prompt learning table: malloc weights and set weight ptr - if (malloc_load_prompt_weights_) { - for (auto const& prompt : prompt_learning_pair_) { - std::string task_name = prompt.first; - int task_name_id = prompt.second.first; - int prompt_length = prompt.second.second; - size_t prompt_id = num_base_weights + (size_t)task_name_id; - - // cuda device to device memcpy prompt table weights buffer memory - cudaD2Dcpy(weights_ptr[prompt_id], other.weights_ptr[prompt_id], prompt_length * prompt_token_weight_size_); - } - } - setWeightPtr(); decoder_layer_weights.clear(); @@ -186,49 +140,22 @@ LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) template void LLaMAWeight::setWeightPtr() { - prompt_learning_table.resize(prompt_learning_pair_.size()); - pre_decoder_embedding_table = weights_ptr[0]; post_decoder_layernorm.beta = weights_ptr[1]; post_decoder_layernorm.gamma = weights_ptr[2]; post_decoder_embedding.kernel = weights_ptr[3]; - - // prompt learning tables: set weight ptr - if (malloc_load_prompt_weights_) { - for (auto const& prompt : prompt_learning_pair_) { - int task_name_id = prompt.second.first; - int prompt_length = prompt.second.second; - size_t task_weight_id = num_base_weights + (size_t)task_name_id; - - // set weight ptr - prompt_learning_table[task_name_id] = {weights_ptr[task_weight_id], prompt_length}; - } - } } template void LLaMAWeight::mallocWeights() { - weights_ptr.resize(num_base_weights + prompt_learning_pair_.size()); + weights_ptr.resize(num_base_weights); deviceMalloc(&weights_ptr[0], vocab_size_ * hidden_units_); deviceMalloc(&weights_ptr[1], hidden_units_); deviceMalloc(&weights_ptr[2], hidden_units_); deviceMalloc(&weights_ptr[3], hidden_units_ * vocab_size_); - // prompt learning tables: malloc weights - if (malloc_load_prompt_weights_) { - for (auto 
const& prompt : prompt_learning_pair_) { - int task_name_id = prompt.second.first; - int prompt_length = prompt.second.second; - size_t task_weight_id = num_base_weights + (size_t)task_name_id; - - // malloc weights - T* prompt_weights_ptr = nullptr; - deviceMalloc(&prompt_weights_ptr, prompt_length * prompt_token_weight_size_); - weights_ptr[task_weight_id] = prompt_weights_ptr; - } - } is_maintain_buffer = true; } @@ -249,28 +176,6 @@ void LLaMAWeight::loadModel(std::string dir_path) dir_path + "/model.lm_head.weight.bin", model_file_type); - // prompt table: load weights from bin - if (malloc_load_prompt_weights_) { - for (auto const& prompt : prompt_learning_pair_) { - std::string task_name = prompt.first; - int task_name_id = prompt.second.first; - int prompt_length = prompt.second.second; - size_t task_weight_id = num_base_weights + (size_t)task_name_id; - - std::string prompt_weight_path_name = (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) ? - (dir_path + "/model.prompt_table." + task_name + ".weight.bin") : - (dir_path + "/model.prefix_prompt." + task_name + ".weight." - + std::to_string(tensor_para_rank_) + ".bin"); - - if (prompt_length > 0) { - loadWeightFromBin(weights_ptr[task_weight_id], - {(size_t)(prompt_length * (int)prompt_token_weight_size_)}, - prompt_weight_path_name, - model_file_type); - } - } - } - for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { decoder_layer_weights[l]->loadModel(dir_path + "/model.layers." + std::to_string(l), model_file_type); diff --git a/src/fastertransformer/models/llama/LLaMAWeight.h b/src/fastertransformer/models/llama/LLaMAWeight.h index dd602c107..5f3c071e6 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.h +++ b/src/fastertransformer/models/llama/LLaMAWeight.h @@ -37,9 +37,7 @@ struct LLaMAWeight { const int tensor_para_rank = 0, const int layer_para_size = 1, const int layer_para_rank = 0, - const bool use_gptj_residual_ = true, - PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, - std::map> prompt_learning_pair = std::map>{}); + const bool use_gptj_residual_ = true); ~LLaMAWeight(); LLaMAWeight(const LLaMAWeight& other); @@ -55,15 +53,6 @@ struct LLaMAWeight { // LLaMA::forward and Gpt::forward become identical const T* position_encoding_table = nullptr; - /* - prompt_learning_pair = vectors of [weight ptr, prompt length] pair - prompt_length is stored here for compatible prompt learning table - prefix_prompt weights store as shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] - p/prompt tuning weights store as shape [prompt_len, hidden_units] - idx is the task_name_id of the prompt tables - */ - std::vector> prompt_learning_table = {}; - LayerNormWeight post_decoder_layernorm; DenseWeight post_decoder_embedding; @@ -92,9 +81,6 @@ struct LLaMAWeight { bool use_gptj_residual_; // prompt learning pair (task_name, (task_name_id, prompt_len)) - PromptLearningType prompt_learning_type_; - std::map> prompt_learning_pair_; - bool malloc_load_prompt_weights_ = false; // each prompt token's weight size size_t prompt_token_weight_size_ = 0; diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index e913570cd..08449b679 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -41,7 +41,7 @@ LLaMA::LLaMA(const int64_t head_num, switch (st_) { case at::ScalarType::Float: - ftgpt = new FTGptNeoX((size_t)head_num, + ftllama = new FTLLaMA((size_t)head_num, 
(size_t)size_per_head, (size_t)inter_size, (size_t)layer_num, @@ -56,7 +56,7 @@ LLaMA::LLaMA(const int64_t head_num, weights); break; case at::ScalarType::Half: - ftgpt = new FTGptNeoX((size_t)head_num, + ftllama = new FTLLaMA((size_t)head_num, (size_t)size_per_head, (size_t)inter_size, (size_t)layer_num, @@ -77,7 +77,7 @@ LLaMA::LLaMA(const int64_t head_num, LLaMA::~LLaMA() { - delete ftgpt; + delete ftllama; } std::vector LLaMA::forward(th::Tensor input_ids, @@ -119,7 +119,7 @@ std::vector LLaMA::forward(th::Tensor input_ids, th::Tensor cum_log_probs = torch::empty({batch_size, beam_width}, torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false)); - ftgpt->forward(input_ids, + ftllama->forward(input_ids, input_lengths, output_ids, sequence_lengths, diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 3cca0bb19..1aac8a7d7 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -81,40 +81,40 @@ class FTLLaMA: public IFLLaMA { ftNcclInitialize(tensor_para_, pipeline_para_, tensor_para_size, pipeline_para_size); - gpt_weights_.resizeLayer(layer_num_); + llama_weights_.resizeLayer(layer_num_); for (int i = 0; i < (int)layer_num_; i++) { - gpt_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta = + llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta = get_ptr(weights_[i + 0 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->pre_layernorm_weights.gamma = + llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.gamma = get_ptr(weights_[i + 1 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.kernel = + llama_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.kernel = get_ptr(weights_[i + 2 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.bias = + llama_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.bias = get_ptr(weights_[i + 3 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.kernel = + llama_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.kernel = get_ptr(weights_[i + 4 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.bias = + llama_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.bias = get_ptr(weights_[i + 5 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.kernel = + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.kernel = get_ptr(weights_[i + 6 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.bias = + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.bias = get_ptr(weights_[i + 7 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.kernel = + llama_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.kernel = get_ptr(weights_[i + 8 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.bias = + llama_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.bias = get_ptr(weights_[i + 9 * layer_num_]); - gpt_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.beta = + llama_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.beta = get_ptr(weights_[i + 10 * layer_num_]); - 
gpt_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.gamma = + llama_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.gamma = get_ptr(weights_[i + 11 * layer_num_]); } - gpt_weights_.pre_decoder_embedding_table = get_ptr(weights_[12 * layer_num_ + 0]); - gpt_weights_.post_decoder_layernorm.gamma = get_ptr(weights_[12 * layer_num_ + 1]); - gpt_weights_.post_decoder_layernorm.beta = get_ptr(weights_[12 * layer_num_ + 2]); - gpt_weights_.post_decoder_embedding.kernel = get_ptr(weights_[12 * layer_num_ + 3]); + llama_weights_.pre_decoder_embedding_table = get_ptr(weights_[12 * layer_num_ + 0]); + llama_weights_.post_decoder_layernorm.gamma = get_ptr(weights_[12 * layer_num_ + 1]); + llama_weights_.post_decoder_layernorm.beta = get_ptr(weights_[12 * layer_num_ + 2]); + llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[12 * layer_num_ + 3]); - gpt_weights_.setMaxSeqLen(max_seq_len); + llama_weights_.setMaxSeqLen(max_seq_len); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); } @@ -172,7 +172,7 @@ class FTLLaMA: public IFLLaMA { false, // with_relative_position_bias true); // causal_mask - ft::LLaMA gpt = ft::LLaMA(head_num_, + ft::LLaMA llama = ft::LLaMA(head_num_, size_per_head_, inter_size_, layer_num_, @@ -267,7 +267,7 @@ class FTLLaMA: public IFLLaMA { } try { - gpt.forward(&output_tensors, &input_tensors, &gpt_weights_); + llama.forward(&output_tensors, &input_tensors, &llama_weights_); } catch (std::runtime_error& error) { std::cout << error.what(); @@ -297,7 +297,7 @@ class FTLLaMA: public IFLLaMA { std::mutex* cublas_wrapper_mutex_; ft::cublasAlgoMap* cublas_algo_map_; struct cudaDeviceProp prop_; - ft::LLaMAWeight gpt_weights_; + ft::LLaMAWeight llama_weights_; ft::NcclParam tensor_para_; ft::NcclParam pipeline_para_; @@ -339,7 +339,7 @@ class LLaMA: public th::jit::CustomClassHolder { private: const at::ScalarType st_; - IFLLaMA* ftgpt; + IFLLaMA* ftllama; std::vector weights; }; From a763d188fc202c2b0e0c64a8d9ec72d279815a3d Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 12 Sep 2023 11:46:03 +0000 Subject: [PATCH 05/55] add examples --- examples/cpp/llama/CMakeLists.txt | 18 ++ examples/cpp/llama/bad_words.csv | 2 + examples/cpp/llama/llama_config.ini | 23 ++ examples/cpp/llama/llama_example.cc | 403 ++++++++++++++++++++++++++++ examples/cpp/llama/start_ids.csv | 8 + examples/cpp/llama/stop_words.csv | 2 + 6 files changed, 456 insertions(+) create mode 100644 examples/cpp/llama/CMakeLists.txt create mode 100644 examples/cpp/llama/bad_words.csv create mode 100644 examples/cpp/llama/llama_config.ini create mode 100644 examples/cpp/llama/llama_example.cc create mode 100644 examples/cpp/llama/start_ids.csv create mode 100644 examples/cpp/llama/stop_words.csv diff --git a/examples/cpp/llama/CMakeLists.txt b/examples/cpp/llama/CMakeLists.txt new file mode 100644 index 000000000..ce0bee75f --- /dev/null +++ b/examples/cpp/llama/CMakeLists.txt @@ -0,0 +1,18 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +add_executable(llama_example llama_example.cc) +target_link_libraries(llama_example PUBLIC -lcublas -lcublasLt -lcudart + LLaMA mpi_utils nccl_utils nvtx_utils + gpt_example_utils word_list) diff --git a/examples/cpp/llama/bad_words.csv b/examples/cpp/llama/bad_words.csv new file mode 100644 index 000000000..6a1126ebd --- /dev/null +++ b/examples/cpp/llama/bad_words.csv @@ -0,0 +1,2 @@ +7768,3908 +1,2 diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini new file mode 100644 index 000000000..58874bdc2 --- /dev/null +++ b/examples/cpp/llama/llama_config.ini @@ -0,0 +1,23 @@ +[ft_instance_hyperparameter] +model_name=llama_33B +model_dir=../models/llam +data_type=fp16 +pipeline_para_size=4 + + +[request] +beam_width=1 # beam width for beam search +request_batch_size=8 # determine by the request +request_output_len=0 # determine by the request + +[llama_33B] +head_num=52 +size_per_head=128 +vocab_size=32000 +decoder_layers=60 +rotary_embedding=128 +multiple_of=256 +start_id=0 +end_id=2 + +use_gptj_residual=1 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc new file mode 100644 index 000000000..699e39154 --- /dev/null +++ b/examples/cpp/llama/llama_example.cc @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/LLaMA.h" +#include "src/fastertransformer/utils/mpi_utils.h" +#include "src/fastertransformer/utils/nccl_utils.h" +#include "src/fastertransformer/utils/nvtx_utils.h" +#include "src/fastertransformer/utils/word_list.h" +#include "3rdparty/INIReader.h" + +// Remove LATER +#include "examples/cpp/multi_gpu_gpt/gpt_example_utils.h" + +#include +#include +#include +#include +#include +#include + +using namespace fastertransformer; + +template +void llama_example(const INIReader reader); + +int main(int argc, char* argv[]) +{ + mpi::initialize(&argc, &argv); + srand(0); + + std::string ini_name; + if (argc == 2) { + ini_name = std::string(argv[1]); + } + else { + ini_name = "../examples/cpp/llama/llama_config.ini"; + } + + INIReader reader = INIReader(ini_name); + if (reader.ParseError() < 0) { + std::cout << "[ERROR] Can't load '" << ini_name << "'\n"; + return -1; + } + const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type"); + + if (data_type == "fp32") { + llama_example(reader); + } + else if (data_type == "fp16") { + llama_example(reader); + } + else { + FT_LOG_ERROR("is_fp16 should be 0 (use float) or 1 (use half)."); + return -1; + } + mpi::finalize(); + return 0; +} + +template +void llama_example(const INIReader reader) +{ + const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); + std::string model_dir = std::string(reader.Get("ft_instance_hyperparameter", "model_dir")); + int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); + + const size_t head_num = reader.GetInteger(model_name, "head_num"); + const size_t size_per_head = reader.GetInteger(model_name, "size_per_head"); + const size_t vocab_size = reader.GetInteger(model_name, "vocab_size"); + const size_t decoder_layers = reader.GetInteger(model_name, "decoder_layers"); + const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); + const int multiple_of = reader.GetInteger(model_name, "multiple_of"); + const int start_id = reader.GetInteger(model_name, "start_id"); + const int end_id = reader.GetInteger(model_name, "end_id"); + + const size_t hidden_units = head_num * size_per_head; + const size_t inter_size = multiple_of * ((2 * hidden_units + multiple_of -1) / multiple_of); + + const size_t beam_width = reader.GetInteger("request", "beam_width"); + const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); + const int request_output_len = reader.GetInteger("request", "request_output_len"); + const int min_length = reader.GetInteger("request", "min_length", 0); + + FT_CHECK(decoder_layers % pipeline_para_size == 0); + + // Prepare the parallelism parameters + int rank = mpi::getCommWorldRank(); + int world_size = mpi::getCommWorldSize(); + if (rank == 0) { + printf("Total ranks: %d.\n", world_size); + } + int device, device_count; + check_cuda_error(cudaGetDeviceCount(&device_count)); + check_cuda_error(cudaSetDevice(rank % device_count)); + check_cuda_error(cudaGetDevice(&device)); + + struct cudaDeviceProp prop; + check_cuda_error(cudaGetDeviceProperties(&prop, device)); + printf("Device %s\n", prop.name); + + printf("P%d is running with GPU #%d.\n", rank, device); + if (pipeline_para_size != world_size) { + printf("[ERROR] pipeline_para_size should equal to world_size \n"); + exit(-1); + } + + const int layers_per_group = decoder_layers / pipeline_para_size; + if (layers_per_group * pipeline_para_size != 
(int)decoder_layers) { + printf("[ERROR] layers_per_group (%d) * pipeline_para_size (%d) should equal to decoder_layers (%ld) \n", + layers_per_group, + pipeline_para_size, + decoder_layers); + exit(-1); + } + + NcclParam tensor_para; + NcclParam pipeline_para; + ftNcclInitialize(tensor_para, pipeline_para, 1, pipeline_para_size); + + // Handle bad_words dictionary + std::vector bad_words; + read_word_list("../examples/cpp/llama/bad_words.csv", bad_words); + + int* d_bad_words = nullptr; + deviceMalloc(&d_bad_words, bad_words.size(), false); + cudaH2Dcpy(d_bad_words, bad_words.data(), bad_words.size()); + + // Handle stop_words dictionary + std::vector stop_words; + read_word_list("../examples/cpp/llama/stop_words.csv", stop_words); + + const size_t stop_words_len = stop_words.size() / 2; + // Tile with same dict for each element + std::vector tiled_stop_words; + for (int i = 0; i < request_batch_size; i++) { + tiled_stop_words.insert(tiled_stop_words.end(), stop_words.begin(), stop_words.end()); + } + + int* d_stop_words = nullptr; + deviceMalloc(&d_stop_words, tiled_stop_words.size(), false); + cudaH2Dcpy(d_stop_words, tiled_stop_words.data(), tiled_stop_words.size()); + + // Read ids of request from file. + size_t max_input_len = -1; + std::vector v_start_lengths; + std::vector v_start_ids; + read_start_ids(request_batch_size, + &v_start_lengths, + &v_start_ids, + max_input_len, + end_id, + 1, + "../examples/cpp/llama/start_ids.csv"); + + int* d_input_ids; + int* d_input_lengths; + if (max_input_len == 0) { + // unconditional case, no input ids, so do nothing. + d_input_ids = nullptr; + d_input_lengths = nullptr; + } + else { + // conditional case. + deviceMalloc(&d_input_ids, request_batch_size * max_input_len, false); + deviceMalloc(&d_input_lengths, request_batch_size, false); + cudaH2Dcpy(d_input_ids, v_start_ids.data(), request_batch_size * max_input_len); + cudaH2Dcpy(d_input_lengths, v_start_lengths.data(), request_batch_size); + } + std::vector start_ids(request_batch_size, start_id); + std::vector end_ids(request_batch_size, end_id); + + const int total_output_len = max_input_len + request_output_len; + + cudaStream_t stream; + cublasHandle_t cublas_handle; + cublasLtHandle_t cublaslt_handle; + cudaStreamCreate(&stream); + cublasCreate(&cublas_handle); + cublasLtCreate(&cublaslt_handle); + cublasSetStream(cublas_handle, stream); + cublasAlgoMap* cublas_algo_map = new cublasAlgoMap("gemm_config.in"); + + Allocator allocator(getDevice()); + + std::mutex* cublas_wrapper_mutex = new std::mutex(); + cublasMMWrapper cublas_wrapper = + cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, cublas_wrapper_mutex, &allocator); + if (std::is_same::value) { + cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } + else if (std::is_same::value) { + cublas_wrapper.setFP32GemmConfig(); + } + + // LLAMA Residual Type + const bool use_gptj_residual = (bool)reader.GetInteger(model_name, "use_gptj_residual", 1); + fastertransformer::LLaMAWeight llama_weights(hidden_units, + inter_size, + vocab_size, + decoder_layers, + 0, // max_seq_len, deprecated + tensor_para.world_size_, + tensor_para.rank_, + pipeline_para.world_size_, + pipeline_para.rank_, + use_gptj_residual); + + model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; + llama_weights.loadModel(model_dir); + unsigned long long random_seed; + if (rank == 0) { + random_seed = (unsigned long long)(0); + } + if (world_size > 1) { + mpi::bcast(&random_seed, 1, 
mpi::MPI_TYPE_UNSIGNED_LONG_LONG, 0, mpi::COMM_WORLD); + } + + AttentionType attention_type = getAttentionType(size_per_head, + getSMVersion(), + true, // remove_padding + 0, // llama supports any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + + LLaMA llama = LLaMA(head_num, + size_per_head, + inter_size, + decoder_layers, + vocab_size, + rotary_embedding_dim, + start_id, + end_id, + use_gptj_residual, + random_seed, + tensor_para, + pipeline_para, + stream, + &cublas_wrapper, + &allocator, + false, + &prop, + attention_type); + + int* d_output_ids; + int* d_sequence_lengths; + deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); + deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); + std::vector output_seq_len(request_batch_size, total_output_len); + std::unordered_map input_tensors = std::unordered_map{ + {"input_ids", + Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, (size_t)max_input_len}, d_input_ids}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size}, d_input_lengths}}, + {"output_seq_len", + Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}, + {"bad_words_list", Tensor{MEMORY_GPU, TYPE_INT32, {2, bad_words.size() / 2}, d_bad_words}}, + {"stop_words_list", Tensor{MEMORY_GPU, TYPE_INT32, {request_batch_size, 2, stop_words_len}, d_stop_words}}, + {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{1}, &min_length}}, + {"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, start_ids.data()}}, + {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, end_ids.data()}}}; + + + input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}); + + std::unordered_map output_tensors = std::unordered_map{ + {"output_ids", + Tensor{MEMORY_GPU, + TYPE_INT32, + std::vector{request_batch_size, beam_width, (size_t)total_output_len}, + d_output_ids}}, + {"sequence_length", + Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, beam_width}, d_sequence_lengths}}, + {"output_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + std::vector{(size_t)request_output_len, request_batch_size, beam_width}, + nullptr}}}; + + print_mem_usage(); + + int ite = 1; + cudaDeviceSynchronize(); + mpi::barrier(); + + cudaProfilerStart(); + // warm up + ite = 1; + ft_nvtx::setScope("warmup_time"); + PUSH_RANGE("warmup time") + for (int i = 0; i < ite; ++i) { + llama.forward(&output_tensors, &input_tensors, &llama_weights); + } + cudaDeviceSynchronize(); + mpi::barrier(); + + POP_RANGE; + ft_nvtx::resetScope(); + + if (rank == 0) { + + std::string fName = "out"; + auto outFile = std::ofstream(fName, std::ios::out); + if (!outFile.is_open()) { + printf("[WARNING] Cannot write results into output file %s \n", fName.c_str()); + } + else { + size_t outCount = total_output_len * request_batch_size * beam_width; + int* hBuf = new int[outCount]; + cudaD2Hcpy(hBuf, d_output_ids, outCount); + + { + std::cout << "Writing " << outCount << " elements\n"; + int zeroCount = 0; + for (size_t i = 0; i < outCount; i++) { + if (hBuf[i] == int(0)) { + zeroCount++; + } + outFile << hBuf[i] << " "; + if ((i + 1) % (total_output_len) == 0) { + outFile << std::endl; + } + + if (i < 10) { + printf("%5d ", hBuf[i]); + } + if ((i + 1) % (total_output_len) == 0 && i < 10) { + std::cout << std::endl; + } + } + std::cout << std::endl << "zeroCount = " << zeroCount << 
std::endl; + } + delete[] hBuf; + } + } + + // test time + struct timeval start, end; + mpi::barrier(); + cudaDeviceSynchronize(); + gettimeofday(&start, NULL); + + ft_nvtx::setScope("total_time"); + PUSH_RANGE("total time") + for (int i = 0; i < ite; ++i) { + llama.forward(&output_tensors, &input_tensors, &llama_weights); + } + + cudaDeviceSynchronize(); + mpi::barrier(); + + POP_RANGE; + ft_nvtx::resetScope(); + gettimeofday(&end, NULL); + + cudaProfilerStop(); + + printf("[INFO] request_batch_size %ld beam_width %ld head_num %ld size_per_head %ld total_output_len %d" + " decoder_layers %ld vocab_size %ld FT-CPP-decoding-beamsearch-time %.2f ms\n", + request_batch_size, + beam_width, + head_num, + size_per_head, + total_output_len, + decoder_layers, + vocab_size, + ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite); + + ftNcclParamDestroy(tensor_para); + ftNcclParamDestroy(pipeline_para); + + delete cublas_algo_map; + delete cublas_wrapper_mutex; + + cudaFree(d_bad_words); + cudaFree(d_stop_words); + if (d_input_ids != nullptr) { + cudaFree(d_input_ids); + } + if (d_input_lengths != nullptr) { + cudaFree(d_input_lengths); + } + if (d_output_ids != nullptr) { + deviceFree(d_output_ids); + } + if (d_sequence_lengths != nullptr) { + deviceFree(d_sequence_lengths); + } + + return; +} diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv new file mode 100644 index 000000000..88e742f39 --- /dev/null +++ b/examples/cpp/llama/start_ids.csv @@ -0,0 +1,8 @@ +688, 253, 1390, 4564, 273, 1897, 13, 247 +510, 1457, 8911, 4487, 273, 26593, 310, 6600 +510, 1457, 2816, 28260, 452, 247, 747, 1481 +510, 1457, 2816, 7717, 556, 3863, 697, 7970 +688, 247, 2118, 326, 588, 2779, 1056, 352 +510, 1457, 2816, 28260, 8, 13413, 19169, 14745 +510, 9462, 5687, 556, 38350, 26212, 253, 747 +510, 806, 673, 309, 3047, 253, 6440, 13 \ No newline at end of file diff --git a/examples/cpp/llama/stop_words.csv b/examples/cpp/llama/stop_words.csv new file mode 100644 index 000000000..9b9b09eba --- /dev/null +++ b/examples/cpp/llama/stop_words.csv @@ -0,0 +1,2 @@ +287, 4346, 12 +3, -1, -1 From 2cb06f11fea98e0bb8dd61ee889870c84b8d966d Mon Sep 17 00:00:00 2001 From: dypshong Date: Wed, 13 Sep 2023 13:51:34 +0000 Subject: [PATCH 06/55] llama...... 
--- .gitignore | 5 +- examples/cpp/llama/llama_config.ini | 4 +- examples/cpp/llama/llama_example.cc | 7 +- src/fastertransformer/models/llama/LLaMA.cc | 77 +++------- src/fastertransformer/models/llama/LLaMA.h | 5 - .../models/llama/LLaMAContextDecoder.cc | 140 ++++++----------- .../models/llama/LLaMAContextDecoder.h | 4 - .../models/llama/LLaMADecoder.cc | 142 ++++++------------ .../models/llama/LLaMADecoder.h | 4 - .../models/llama/LLaMADecoderLayerWeight.cc | 39 ++--- .../models/llama/LLaMADecoderLayerWeight.h | 4 +- .../models/llama/LLaMAWeight.cc | 16 +- .../models/llama/LLaMAWeight.h | 15 +- 13 files changed, 137 insertions(+), 325 deletions(-) diff --git a/.gitignore b/.gitignore index 77849f435..5b49d9183 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,7 @@ __pycache__/ **/.ipynb_checkpoints/ /3rdparty/NeMo/ -/3rdparty/apex/ \ No newline at end of file +/3rdparty/apex/ +20B_checkpoints/ +compile_commands.json +model/ diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 58874bdc2..68f4663d1 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -1,6 +1,6 @@ [ft_instance_hyperparameter] model_name=llama_33B -model_dir=../models/llam +model_dir=../models/llama data_type=fp16 pipeline_para_size=4 @@ -19,5 +19,3 @@ rotary_embedding=128 multiple_of=256 start_id=0 end_id=2 - -use_gptj_residual=1 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 699e39154..62919d57a 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -205,18 +205,14 @@ void llama_example(const INIReader reader) cublas_wrapper.setFP32GemmConfig(); } - // LLAMA Residual Type - const bool use_gptj_residual = (bool)reader.GetInteger(model_name, "use_gptj_residual", 1); fastertransformer::LLaMAWeight llama_weights(hidden_units, inter_size, vocab_size, decoder_layers, - 0, // max_seq_len, deprecated tensor_para.world_size_, tensor_para.rank_, pipeline_para.world_size_, - pipeline_para.rank_, - use_gptj_residual); + pipeline_para.rank_); model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; llama_weights.loadModel(model_dir); @@ -244,7 +240,6 @@ void llama_example(const INIReader reader) rotary_embedding_dim, start_id, end_id, - use_gptj_residual, random_seed, tensor_para, pipeline_para, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 575636fb4..3734b63d5 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -16,8 +16,8 @@ #include "src/fastertransformer/models/llama/LLaMA.h" #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" -#include "src/fastertransformer/kernels/decoding_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" +#include "src/fastertransformer/kernels/decoding_kernels.h" #include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" #include @@ -32,9 +32,7 @@ void LLaMA::initialize() num_layer_, rotary_embedding_dim_, neox_rotary_style_, - use_gptj_residual_, layernorm_eps_, - tensor_para_, pipeline_para_, stream_, cublas_wrapper_, @@ -51,9 +49,7 @@ void LLaMA::initialize() num_layer_, rotary_embedding_dim_, neox_rotary_style_, - use_gptj_residual_, layernorm_eps_, - tensor_para_, pipeline_para_, stream_, cublas_wrapper_, @@ -96,20 +92,16 @@ void LLaMA::allocateBuffer( (T*)(allocator_->reMalloc(padded_embedding_bias_, sizeof(T) * vocab_size_padded_, true)); } 
- input_attention_mask_ = (T*)(allocator_->reMalloc( - input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); - decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - decoder_output_buf_ = - (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - normed_decoder_output_buf_ = - (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); - nccl_logits_buf_ = - (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); - cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); - finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); - h_finished_buf_ = new bool[batchxbeam]; - sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); + input_attention_mask_ = (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); + decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + decoder_output_buf_ = (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + nccl_logits_buf_ = (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); + finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); + h_finished_buf_ = new bool[batchxbeam]; + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); value_cache_ = key_cache_ + self_cache_size; @@ -214,7 +206,6 @@ LLaMA::LLaMA(size_t head_num, size_t rotary_embedding_dim, int start_id, int end_id, - bool use_gptj_residual, unsigned long long random_seed, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -233,7 +224,6 @@ LLaMA::LLaMA(size_t head_num, rotary_embedding_dim_(rotary_embedding_dim), start_id_(start_id), end_id_(end_id), - use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), local_head_num_(head_num / 1), attention_type_(attention_type) @@ -260,7 +250,6 @@ LLaMA::LLaMA(size_t head_num, size_t rotary_embedding_dim, int start_id, int end_id, - bool use_gptj_residual, unsigned long long random_seed, NcclParam tensor_para, NcclParam pipeline_para, @@ -281,7 +270,6 @@ LLaMA::LLaMA(size_t head_num, rotary_embedding_dim_(rotary_embedding_dim), start_id_(start_id), end_id_(end_id), - use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), tensor_para_(tensor_para), pipeline_para_(pipeline_para), @@ -310,7 +298,6 @@ LLaMA::LLaMA(LLaMA const& llama): start_id_(llama.start_id_), end_id_(llama.end_id_), prompt_learning_start_id_(llama.prompt_learning_start_id_), - use_gptj_residual_(llama.use_gptj_residual_), 
hidden_units_(llama.hidden_units_), tensor_para_(llama.tensor_para_), pipeline_para_(llama.pipeline_para_), @@ -403,27 +390,19 @@ void LLaMA::forward(std::unordered_map* output_ten // Padding works as follows: p p x x i i i x x --> p p i i i x x x x (p denotes prompt, i denotes input, x denotes // pad) // TODO (perkzz): move unnecessary paddings - has_prefix_prompt_ = false; int max_prefix_prompt_length = 0; // NOTE: Prefix Prompt PreProcessing // get prefix_prompt_weight for each batch --> shape [batch, beam_width] // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] - std::vector prefix_prompt_weight_batch_ptrs; - std::vector prefix_prompt_lengths; - int max_input_length = input_tensors->at("input_ids").shape[1]; - FT_CHECK_WITH_INFO(!(max_input_length == 0 && max_prefix_prompt_length > 0), - "Prefix Prompt should come with inputs!"); // Prefix Soft Prompt - has_prefix_soft_prompt_ = false; - const size_t max_prefix_soft_prompt_length = 0; - const size_t limit_len_offset = 0 + (max_input_length == 0 ? 1 : 0); + const size_t limit_len_offset = (max_input_length == 0 ? 1 : 0); const size_t max_output_seq_len = input_tensors->at("output_seq_len").max() + limit_len_offset; const size_t max_seq_len = max_output_seq_len; // max cache seq len should include max prefix prompt length as it has k/v states - const size_t max_cache_seq_len = max_output_seq_len + max_prefix_prompt_length; + const size_t max_cache_seq_len = max_output_seq_len; if (max_cache_seq_len < max_seq_len) { FT_LOG_WARNING("max_cache_seq_len (%d) is less than max_seq_len (%d). " "Note that this reduces the memory cost of k/v cache, but may hurt the accuracy.", @@ -437,8 +416,7 @@ void LLaMA::forward(std::unordered_map* output_ten max_seq_len); } const cudaDataType_t gemm_data_type = getCudaDataType(); - allocateBuffer( - batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length + 0); + allocateBuffer(batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length); setSeqLimitLen(seq_limit_len_, input_tensors->at("output_seq_len"), limit_len_offset, batch_size); sync_check_cuda_error(); @@ -472,8 +450,6 @@ void LLaMA::forward(std::unordered_map* output_ten cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); } - // Prefix prompts - sync_check_cuda_error(); // handle first step @@ -507,7 +483,7 @@ void LLaMA::forward(std::unordered_map* output_ten tiled_prompt_lengths_buf_, batch_size * beam_width, max_input_length, - max_prefix_prompt_length, + 0, stream_); sync_check_cuda_error(); @@ -523,19 +499,19 @@ void LLaMA::forward(std::unordered_map* output_ten {batch_size * beam_width, 1, (size_t)max_input_length, - (size_t)(max_input_length + max_prefix_prompt_length)}, + (size_t)(max_input_length)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_}}, {"d_prefix_prompt_batch", Tensor{MEMORY_GPU, data_type, {batch_size * beam_width}, - has_prefix_prompt_ ? prompt_learning_weight_batch_ : nullptr}}, + nullptr}}, {"d_prefix_prompt_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, - has_prefix_prompt_ ? 
tiled_prompt_lengths_buf_ : nullptr}}}; + nullptr}}}; std::unordered_map decoder_output_tensors{ {"decoder_output", @@ -561,8 +537,7 @@ void LLaMA::forward(std::unordered_map* output_ten max_input_length - 1, stream_); sync_check_cuda_error(); - } - else if (max_input_length == 0) { + } else if (max_input_length == 0) { max_input_length++; invokeDecodingInitialize(finished_buf_, sequence_lengths_, @@ -580,8 +555,7 @@ void LLaMA::forward(std::unordered_map* output_ten cudaMemcpyHostToDevice, stream_); sync_check_cuda_error(); - } - else if (max_input_length == 1) { + } else if (max_input_length == 1) { invokeDecodingInitialize(finished_buf_, sequence_lengths_, nullptr, @@ -630,7 +604,7 @@ void LLaMA::forward(std::unordered_map* output_ten input_tensors->at("input_lengths").getPtr(), // not_tiled tiled_prompt_lengths_buf_, max_cache_seq_len, - max_input_length + max_prefix_prompt_length, + max_input_length, 0, batch_size, beam_width, @@ -685,7 +659,7 @@ void LLaMA::forward(std::unordered_map* output_ten Tensor{MEMORY_GPU, TYPE_INT32, {local_batch_size}, - has_prefix_prompt_ ? (tiled_prompt_lengths_buf_ + id_offset) : nullptr}}, + nullptr}}, {"max_prefix_prompt_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_prefix_prompt_length}}, {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, @@ -902,9 +876,9 @@ void LLaMA::forward(std::unordered_map* output_ten */ invokeUpdatePaddingCount(tiled_total_padding_count_, input_tensors->at("input_lengths").getPtr(), // not_tiled - has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : (const int*)nullptr, + (const int*)nullptr, max_input_length, - has_prefix_prompt_ ? max_prefix_prompt_length : 0, + 0, batch_size, beam_width, stream_); @@ -1015,8 +989,7 @@ void LLaMA::setOutputTensors(std::unordered_map* o param.parent_ids = beam_width == 1 ? nullptr : parent_ids_buf_; param.end_tokens = end_ids_buf_; param.max_input_length = max_input_length; - param.prefix_soft_prompt_lengths = - has_prefix_soft_prompt_ ? 
input_tensors->at("request_prompt_lengths").getPtr() : nullptr; + param.prefix_soft_prompt_lengths = nullptr; param.input_lengths = tiled_input_lengths_buf_; param.max_prefix_soft_prompt_length = 0; param.max_input_without_prompt_length = max_input_length; diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 48506f529..2f4f52c7b 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -60,9 +60,6 @@ class LLaMA: public BaseLayer { (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); - // Residual Type - const bool use_gptj_residual_ = true; - // Prompt Learning Parameters PromptLearningType prompt_learning_type_; int prompt_learning_start_id_; // start_id for prompt_learning (only needed by prefix prompts) @@ -145,7 +142,6 @@ class LLaMA: public BaseLayer { size_t rotary_embedding_dim, int start_id, int end_id, - bool use_gptj_residual, unsigned long long random_seed, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -164,7 +160,6 @@ class LLaMA: public BaseLayer { size_t rotary_embedding_dim, int start_id, int end_id, - bool use_gptj_residual, unsigned long long random_seed, NcclParam tensor_para, NcclParam pipeline_para, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 69ed839a3..ecf127ae6 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -18,49 +18,44 @@ #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" -#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" -#include "src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.h" +#include "src/fastertransformer/layers/FfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" + namespace fastertransformer { template void LLaMAContextDecoder::initialize() { - self_attention_layer_ = new TensorParallelGptContextAttentionLayer(0, // max_batch_size - 0, // max_seq_len - head_num_, - size_per_head_, - rotary_embedding_dim_, - neox_rotary_style_, - tensor_para_, - stream_, - cublas_wrapper_, - allocator_, - !use_gptj_residual_, - is_free_buffer_after_forward_, - is_qk_buf_float_, - false, - 0, - custom_all_reduce_comm_, - enable_custom_all_reduce_); - - ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size - 0, // max_seq_len - head_num_, - size_per_head_, - 0, // expert_num - inter_size_, - tensor_para_, - stream_, - cublas_wrapper_, - allocator_, - !use_gptj_residual_, - is_free_buffer_after_forward_, - false, - 0, - false, // use_gated_activation = false; - custom_all_reduce_comm_, - enable_custom_all_reduce_); + self_attention_layer_ = new GptContextAttentionLayer(0, // max_batch_size + 0, // max_seq_len + head_num_, + size_per_head_, + head_num_, + rotary_embedding_dim_, + neox_rotary_style_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + is_qk_buf_float_, + false, + 0); + + ffn_layer_ = new GeluFfnLayer(0, // max_batch_size + 1, + head_num_, + size_per_head_, + 0, // expert_num + inter_size_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + false, + 0, + false // use_gated_activation = false + ); } template @@ -138,9 +133,7 @@ 
LLaMAContextDecoder::LLaMAContextDecoder(size_t size_t num_layer, size_t rotary_embedding_dim, bool neox_rotary_style, - bool use_gptj_residual, float layernorm_eps, - NcclParam tensor_para, NcclParam pipeline_para, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -157,10 +150,8 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t num_layer_(num_layer), rotary_embedding_dim_(rotary_embedding_dim), neox_rotary_style_(neox_rotary_style), - use_gptj_residual_(use_gptj_residual), layernorm_eps_(layernorm_eps), hidden_units_(head_num * size_per_head), - tensor_para_(tensor_para), pipeline_para_(pipeline_para), is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type), @@ -179,10 +170,8 @@ LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decode num_layer_(decoder.num_layer_), rotary_embedding_dim_(decoder.rotary_embedding_dim_), neox_rotary_style_(decoder.neox_rotary_style_), - use_gptj_residual_(decoder.use_gptj_residual_), layernorm_eps_(decoder.layernorm_eps_), hidden_units_(decoder.hidden_units_), - tensor_para_(decoder.tensor_para_), pipeline_para_(decoder.pipeline_para_), is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_), @@ -319,15 +308,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* } if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_ / tensor_para_.world_size_; - ftNcclRecv(layer_input + data_size * tensor_para_.rank_, + int data_size = h_token_num * hidden_units_; + ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - if (tensor_para_.world_size_ > 1) { - ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); - } } invokeGeneralLayerNorm(decoder_normed_input_, @@ -396,20 +382,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* &llama_decoder_layer_weight->at(l)->self_attention_weights); if (is_final == false) { - if (use_gptj_residual_) { - invokeGeneralLayerNorm(decoder_normed_input_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - layernorm_eps_, - h_token_num, - hidden_units_, - (float*)nullptr, - 0, - stream_); - } - else { - invokeGeneralAddBiasResidualPreLayerNorm( + invokeGeneralAddBiasResidualPreLayerNorm( self_attn_output_, decoder_normed_input_, self_attn_output_, @@ -426,7 +399,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* (float*)nullptr, 0, stream_); - } TensorMap ffn_input_tensors( {{"ffn_input", @@ -435,47 +407,23 @@ void LLaMAContextDecoder::forward(std::unordered_map* Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, - use_gptj_residual_ ? 
ffn_output_ : layer_output}}}); + layer_output}}}); ffn_layer_->forward( &ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); - if (use_gptj_residual_) { - // Original workflow: - // layer_output = layer_input + reduceSum(ffn_output + self_attn_output + ffn_output_bias) - // Our workflow: - // layer_output = reduceSum(ffn_output + self_attn_output + ffn_output_bias + layer_input / - // TP_size) - // They are equivalent on math, but we can use same buffer for layer_input and layer_output - - invokeAddBiasAttentionFfnResidual(layer_output, - ffn_output_, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - h_token_num, - hidden_units_, - tensor_para_.world_size_, - stream_); - if (tensor_para_.world_size_ > 1) { - ftNcclAllReduceSum( - layer_output, layer_output, h_token_num * hidden_units_, tensor_para_, stream_); - } - } - else { - invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - h_token_num, - hidden_units_, - stream_); - } + invokeAddBiasResidual(layer_output, + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + stream_); sync_check_cuda_error(); if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_ / tensor_para_.world_size_; - ftNcclSend(layer_output + data_size * tensor_para_.rank_, + int data_size = h_token_num * hidden_units_; + ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index b84285f14..c9c474e49 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -42,13 +42,11 @@ class LLaMAContextDecoder: public BaseLayer { size_t num_layer_; size_t rotary_embedding_dim_; bool neox_rotary_style_; - bool use_gptj_residual_; float layernorm_eps_; // calculated data size_t hidden_units_; - NcclParam tensor_para_; NcclParam pipeline_para_; std::shared_ptr custom_all_reduce_comm_; @@ -88,9 +86,7 @@ class LLaMAContextDecoder: public BaseLayer { size_t num_layer, size_t rotary_embedding_dim, bool neox_rotary_style, - bool use_gptj_residual, float layernorm_eps, - NcclParam tensor_para, NcclParam pipeline_para, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, diff --git a/src/fastertransformer/models/llama/LLaMADecoder.cc b/src/fastertransformer/models/llama/LLaMADecoder.cc index 3a8fc1458..051744693 100644 --- a/src/fastertransformer/models/llama/LLaMADecoder.cc +++ b/src/fastertransformer/models/llama/LLaMADecoder.cc @@ -15,47 +15,43 @@ */ #include "src/fastertransformer/models/llama/LLaMADecoder.h" -#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" -#include "src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.h" +#include "src/fastertransformer/layers/FfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h" namespace fastertransformer { template void LLaMADecoder::initialize() { - self_attention_layer_ = new TensorParallelDecoderSelfAttentionLayer(0, // max_batch_size - head_num_, - size_per_head_, - rotary_embedding_dim_, - neox_rotary_style_, - tensor_para_, - stream_, - cublas_wrapper_, - allocator_, - 
!use_gptj_residual_, - is_free_buffer_after_forward_, - false, - 0, - custom_all_reduce_comm_, - enable_custom_all_reduce_); - - ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size - 1, - head_num_, - size_per_head_, - 0, // expert_num - inter_size_, - tensor_para_, - stream_, - cublas_wrapper_, - allocator_, - !use_gptj_residual_, - is_free_buffer_after_forward_, - false, - 0, - false, // use_gated_activation = false; - custom_all_reduce_comm_, - enable_custom_all_reduce_); + self_attention_layer_ = new DecoderSelfAttentionLayer(0, // max_batch_size + head_num_, + size_per_head_, + head_num_, + rotary_embedding_dim_, + neox_rotary_style_, + head_num_ * size_per_head_, + 1.0f, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + false, + 0); + + ffn_layer_ = new GeluFfnLayer(0, // max_batch_size + 1, + head_num_, + size_per_head_, + 0, // expert_num + inter_size_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + false, + 0, + false // use_gated_activation = false + ); } template @@ -126,9 +122,7 @@ LLaMADecoder::LLaMADecoder(size_t head_num, size_t num_layer, size_t rotary_embedding_dim, bool neox_rotary_style, - bool use_gptj_residual, float layernorm_eps, - NcclParam tensor_para, NcclParam pipeline_para, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -143,10 +137,8 @@ LLaMADecoder::LLaMADecoder(size_t head_num, num_layer_(num_layer), rotary_embedding_dim_(rotary_embedding_dim), neox_rotary_style_(neox_rotary_style), - use_gptj_residual_(use_gptj_residual), layernorm_eps_(layernorm_eps), hidden_units_(head_num_ * size_per_head), - tensor_para_(tensor_para), pipeline_para_(pipeline_para), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) @@ -163,10 +155,8 @@ LLaMADecoder::LLaMADecoder(LLaMADecoder const& decoder): num_layer_(decoder.num_layer_), rotary_embedding_dim_(decoder.rotary_embedding_dim_), neox_rotary_style_(decoder.neox_rotary_style_), - use_gptj_residual_(decoder.use_gptj_residual_), layernorm_eps_(decoder.layernorm_eps_), hidden_units_(decoder.hidden_units_), - tensor_para_(decoder.tensor_para_), pipeline_para_(decoder.pipeline_para_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) @@ -247,18 +237,15 @@ void LLaMADecoder::forward(std::unordered_map* T* layer_output = (l == num_layer_ - 1) ? 
decoder_output : decoder_layer_output_; if (isFirstLayerParallelId(l) == true && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { - int data_size = local_batch_size * hidden_units_ / tensor_para_.world_size_; + int data_size = local_batch_size * hidden_units_; // ftNcclRecv(layer_input, local_batch_size * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, // stream_); - ftNcclRecv(layer_input + data_size * tensor_para_.rank_, + ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - if (tensor_para_.world_size_ > 1) { - ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); - } } invokeGeneralLayerNorm(decoder_normed_input_, @@ -293,22 +280,10 @@ void LLaMADecoder::forward(std::unordered_map* {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; self_attention_layer_->forward(&self_attention_output_tensors, - &self_attention_input_tensors, - &llama_decoder_layer_weight->at(l)->self_attention_weights); - if (use_gptj_residual_) { - invokeGeneralLayerNorm(decoder_normed_input_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - layernorm_eps_, - local_batch_size, - hidden_units_, - (float*)nullptr, - 0, - stream_); - } - else { - invokeGeneralAddBiasResidualPreLayerNorm( + &self_attention_input_tensors, + &llama_decoder_layer_weight->at(l)->self_attention_weights); + + invokeGeneralAddBiasResidualPreLayerNorm( self_attn_output_, decoder_normed_input_, self_attn_output_, @@ -325,7 +300,6 @@ void LLaMADecoder::forward(std::unordered_map* (float*)nullptr, 0, stream_); - } TensorMap ffn_input_tensors( {{"ffn_input", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}}}); @@ -333,46 +307,22 @@ void LLaMADecoder::forward(std::unordered_map* Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, - use_gptj_residual_ ? 
ffn_output_ : layer_output}}}); + layer_output}}}); ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); - if (use_gptj_residual_) { - // Original workflow: - // layer_output = layer_input + reduceSum(ffn_output + self_attn_output + ffn_output_bias) - // Our workflow: - // layer_output = reduceSum(ffn_output + self_attn_output + ffn_output_bias + layer_input / TP_size) - // They are equivalent on math, but we can use same buffer for layer_input and layer_output - invokeAddBiasAttentionFfnResidual(layer_output, - ffn_output_, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - local_batch_size, - hidden_units_, - tensor_para_.world_size_, - stream_); - if (tensor_para_.world_size_ > 1) { - ftNcclAllReduceSum(layer_output, layer_output, local_batch_size * hidden_units_, tensor_para_, stream_); - } - } - else { - invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - local_batch_size, - hidden_units_, - stream_); - } + invokeAddBiasResidual(layer_output, + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + local_batch_size, + hidden_units_, + stream_); sync_check_cuda_error(); if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { - int data_size = local_batch_size * hidden_units_ / tensor_para_.world_size_; - // ftNcclSend(layer_output, local_batch_size * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, - // stream_); - - ftNcclSend(layer_output + data_size * tensor_para_.rank_, + int data_size = local_batch_size * hidden_units_; + ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, diff --git a/src/fastertransformer/models/llama/LLaMADecoder.h b/src/fastertransformer/models/llama/LLaMADecoder.h index cbbc272ff..773637d65 100644 --- a/src/fastertransformer/models/llama/LLaMADecoder.h +++ b/src/fastertransformer/models/llama/LLaMADecoder.h @@ -52,11 +52,9 @@ class LLaMADecoder: public BaseLayer { size_t num_layer_; size_t rotary_embedding_dim_; bool neox_rotary_style_; - bool use_gptj_residual_; size_t hidden_units_; float layernorm_eps_; - NcclParam tensor_para_; NcclParam pipeline_para_; std::shared_ptr custom_all_reduce_comm_; @@ -77,9 +75,7 @@ class LLaMADecoder: public BaseLayer { size_t num_layer, size_t rotary_embedding_dim, bool neox_rotary_style, - bool use_gptj_residual, float layernorm_eps, - NcclParam tensor_para, NcclParam pipeline_para, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc index 9ed355047..412a1d076 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc @@ -23,13 +23,11 @@ template LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size, - const int tensor_para_rank, - const bool use_gptj_residual): + const int tensor_para_rank): hidden_units_(hidden_units), inter_size_(inter_size), tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank), - use_gptj_residual_(use_gptj_residual) + tensor_para_rank_(tensor_para_rank) { mallocWeights(); setWeightPtr(); @@ -40,7 +38,7 @@ LLaMADecoderLayerWeight::~LLaMADecoderLayerWeight() { if 
(is_maintain_buffer == true) { for (int i = 0; i < 12; i++) { - if (!use_gptj_residual_ && i != attention_dense_bias_weight_id) { + if (i != attention_dense_bias_weight_id) { cudaFree(weights_ptr[i]); } } @@ -67,8 +65,7 @@ LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const LLaMADecoderLayerWeigh hidden_units_(other.hidden_units_), inter_size_(other.inter_size_), tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_), - use_gptj_residual_(other.use_gptj_residual_) + tensor_para_rank_(other.tensor_para_rank_) { mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); @@ -76,9 +73,7 @@ LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const LLaMADecoderLayerWeigh cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - if (!use_gptj_residual_) { - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - } + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); @@ -96,7 +91,6 @@ LLaMADecoderLayerWeight& LLaMADecoderLayerWeight::operator=(const LLaMADec inter_size_ = other.inter_size_; tensor_para_size_ = other.tensor_para_size_; tensor_para_rank_ = other.tensor_para_rank_; - use_gptj_residual_ = other.use_gptj_residual_; mallocWeights(); @@ -105,9 +99,7 @@ LLaMADecoderLayerWeight& LLaMADecoderLayerWeight::operator=(const LLaMADec cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - if (!use_gptj_residual_) { - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - } + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); @@ -143,10 +135,7 @@ void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType dir_path + ".attention.dense.weight." + rank_spec + ".bin", model_file_type); - if (!use_gptj_residual_) { - loadWeightFromBin( - weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.dense.bias.bin", model_file_type); - } + loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.dense.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[6], {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, @@ -160,14 +149,8 @@ void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.weight." 
+ rank_spec + ".bin", model_file_type); - if (use_gptj_residual_) { - loadWeightFromBin( - weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".mlp.attention.bias.sum.bin", model_file_type); - } - else { - loadWeightFromBin( + loadWeightFromBin( weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.bias.bin", model_file_type); - } loadWeightFromBin( weights_ptr[10], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.bias.bin", model_file_type); loadWeightFromBin( @@ -182,7 +165,7 @@ void LLaMADecoderLayerWeight::setWeightPtr() self_attention_weights.query_weight.kernel = weights_ptr[2]; self_attention_weights.query_weight.bias = weights_ptr[3]; self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; - self_attention_weights.attention_output_weight.bias = use_gptj_residual_ ? nullptr : weights_ptr[5]; + self_attention_weights.attention_output_weight.bias = weights_ptr[5]; ffn_weights.intermediate_weight.kernel = weights_ptr[6]; ffn_weights.intermediate_weight.bias = weights_ptr[7]; @@ -202,9 +185,7 @@ void LLaMADecoderLayerWeight::mallocWeights() deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - if (!use_gptj_residual_) { - deviceMalloc(&weights_ptr[5], hidden_units_); - } + deviceMalloc(&weights_ptr[5], hidden_units_); deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h index 44726f58c..4a6fc6a22 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h @@ -32,8 +32,7 @@ struct LLaMADecoderLayerWeight { LLaMADecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, - const int tensor_para_rank = 0, - const bool use_gptj_residual = true); + const int tensor_para_rank = 0); ~LLaMADecoderLayerWeight(); LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other); LLaMADecoderLayerWeight& operator=(const LLaMADecoderLayerWeight& other); @@ -50,7 +49,6 @@ struct LLaMADecoderLayerWeight { int inter_size_; int tensor_para_size_; int tensor_para_rank_; - bool use_gptj_residual_; const int attention_dense_bias_weight_id = 5; bool is_maintain_buffer = false; T* weights_ptr[12]; diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc index dddf6eff6..f0bdc282f 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.cc +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -23,31 +23,26 @@ LLaMAWeight::LLaMAWeight(const int hidden_un const int inter_size, const int vocab_size, const int num_layer, - const int max_seq_len, const int tensor_para_size, const int tensor_para_rank, const int layer_para_size, - const int layer_para_rank, - const bool use_gptj_residual): + const int layer_para_rank): hidden_units_(hidden_units), inter_size_(inter_size), vocab_size_(vocab_size), num_layer_(num_layer), - max_seq_len_(max_seq_len), tensor_para_size_(tensor_para_size), tensor_para_rank_(tensor_para_rank), layer_para_size_(layer_para_size), - layer_para_rank_(layer_para_rank), - use_gptj_residual_(use_gptj_residual) + layer_para_rank_(layer_para_rank) { 
FT_CHECK(num_layer_ % layer_para_size_ == 0); - decoder_layer_weights.reserve(num_layer_); for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { decoder_layer_weights.push_back(new LLaMADecoderLayerWeight( - hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); + hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_)); } else { // Layer-parallelism: allocate empty layer because @@ -82,12 +77,10 @@ LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): inter_size_(other.inter_size_), vocab_size_(other.vocab_size_), num_layer_(other.num_layer_), - max_seq_len_(other.max_seq_len_), tensor_para_size_(other.tensor_para_size_), tensor_para_rank_(other.tensor_para_rank_), layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), - use_gptj_residual_(other.use_gptj_residual_), prompt_token_weight_size_(other.prompt_token_weight_size_) { mallocWeights(); @@ -113,12 +106,10 @@ LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) inter_size_ = other.inter_size_; vocab_size_ = other.vocab_size_; num_layer_ = other.num_layer_; - max_seq_len_ = other.max_seq_len_; tensor_para_size_ = other.tensor_para_size_; tensor_para_rank_ = other.tensor_para_rank_; layer_para_size_ = other.layer_para_size_; layer_para_rank_ = other.layer_para_rank_; - use_gptj_residual_ = other.use_gptj_residual_; prompt_token_weight_size_ = other.prompt_token_weight_size_; mallocWeights(); @@ -169,6 +160,7 @@ void LLaMAWeight::loadModel(std::string dir_path) weights_ptr[0], {(size_t)(vocab_size_ * hidden_units_)}, dir_path + "/model.wte.bin", model_file_type); loadWeightFromBin( weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.bias.bin", model_file_type); + loadWeightFromBin( weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.weight.bin", model_file_type); loadWeightFromBin(weights_ptr[3], diff --git a/src/fastertransformer/models/llama/LLaMAWeight.h b/src/fastertransformer/models/llama/LLaMAWeight.h index 5f3c071e6..b372139e2 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.h +++ b/src/fastertransformer/models/llama/LLaMAWeight.h @@ -32,12 +32,10 @@ struct LLaMAWeight { const int inter_size, const int vocab_size, const int num_layer, - const int max_seq_len, const int tensor_para_size = 1, const int tensor_para_rank = 0, const int layer_para_size = 1, - const int layer_para_rank = 0, - const bool use_gptj_residual_ = true); + const int layer_para_rank = 0); ~LLaMAWeight(); LLaMAWeight(const LLaMAWeight& other); @@ -49,18 +47,11 @@ struct LLaMAWeight { std::vector*> decoder_layer_weights; const T* pre_decoder_embedding_table = nullptr; - // GPT-J does not use embedding table, but we leave the ptr such that - // LLaMA::forward and Gpt::forward become identical const T* position_encoding_table = nullptr; LayerNormWeight post_decoder_layernorm; DenseWeight post_decoder_embedding; - inline void setMaxSeqLen(size_t max_seq_len) - { - max_seq_len_ = max_seq_len; - } - private: void setWeightPtr(); void mallocWeights(); @@ -70,16 +61,12 @@ struct LLaMAWeight { int inter_size_; int vocab_size_; int num_layer_; - int max_seq_len_; int tensor_para_size_; int tensor_para_rank_; int layer_para_size_; int layer_para_rank_; - // residual type - bool use_gptj_residual_; - // prompt learning pair (task_name, (task_name_id, prompt_len)) // each prompt token's weight size size_t prompt_token_weight_size_ = 0; From ca0a25a4dabad2b0aa935aa15b342ecc13de05ca Mon Sep 17 00:00:00 2001 From: dypshong 
Date: Wed, 13 Sep 2023 13:59:03 +0000 Subject: [PATCH 07/55] remove gpt dependency --- examples/cpp/llama/CMakeLists.txt | 6 +- examples/cpp/llama/llama_example.cc | 4 +- examples/cpp/llama/llama_example_utils.cc | 95 +++++++++++++++++++++++ examples/cpp/llama/llama_example_utils.h | 31 ++++++++ 4 files changed, 132 insertions(+), 4 deletions(-) create mode 100644 examples/cpp/llama/llama_example_utils.cc create mode 100644 examples/cpp/llama/llama_example_utils.h diff --git a/examples/cpp/llama/CMakeLists.txt b/examples/cpp/llama/CMakeLists.txt index ce0bee75f..19fb6e7fc 100644 --- a/examples/cpp/llama/CMakeLists.txt +++ b/examples/cpp/llama/CMakeLists.txt @@ -12,7 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +add_library(llama_example_utils STATIC llama_example_utils.cc) +target_link_libraries(llama_example_utils PUBLIC -lcublas -lcublasLt -lcudart + nvtx_utils mpi_utils nccl_utils) + add_executable(llama_example llama_example.cc) target_link_libraries(llama_example PUBLIC -lcublas -lcublasLt -lcudart LLaMA mpi_utils nccl_utils nvtx_utils - gpt_example_utils word_list) + llama_example_utils word_list) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 62919d57a..4d0d60a93 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -19,11 +19,9 @@ #include "src/fastertransformer/utils/nccl_utils.h" #include "src/fastertransformer/utils/nvtx_utils.h" #include "src/fastertransformer/utils/word_list.h" +#include "examples/cpp/llama/llama_example_utils.h" #include "3rdparty/INIReader.h" -// Remove LATER -#include "examples/cpp/multi_gpu_gpt/gpt_example_utils.h" - #include #include #include diff --git a/examples/cpp/llama/llama_example_utils.cc b/examples/cpp/llama/llama_example_utils.cc new file mode 100644 index 000000000..77f621dbf --- /dev/null +++ b/examples/cpp/llama/llama_example_utils.cc @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "examples/cpp/llama/llama_example_utils.h" + +#include +#include +#include +#include + +namespace fastertransformer { + +int read_start_ids(size_t batch_size, + std::vector* v_start_lengths, + std::vector* v_start_ids, + size_t& max_input_len, + const int end_id, + const int beam_width, + std::string file_name) +{ + std::vector> tmp_start_ids; + std::vector tmp_start_lengths; + + std::ifstream start_id_file(file_name, std::ios::in); + int line_num = 0; + if (start_id_file.is_open()) { + std::string line; + while (std::getline(start_id_file, line)) { + std::stringstream lineStream(line); + std::string vals; + int i1 = 0; + std::vector tmp_vec; + while (std::getline(lineStream, vals, ',')) { + tmp_vec.push_back(std::stoi(vals)); + i1++; + } + tmp_start_ids.push_back(tmp_vec); + tmp_start_lengths.push_back(i1); + line_num++; + } + if (batch_size == 0) { + batch_size = line_num; + } + } + else { + printf("[WARNING] Cannot open the file '%s'. \n", file_name.c_str()); + max_input_len = 0; + return 0; + } + + max_input_len = tmp_start_lengths.data()[0]; + for (uint i = 1; i < (uint)tmp_start_lengths.size(); i++) { + max_input_len = max_input_len > tmp_start_lengths.data()[i] ? max_input_len : tmp_start_lengths.data()[i]; + } + + while ((int)tmp_start_lengths.size() < batch_size) { + std::vector padding_ids; + for (int i = 0; i < max_input_len; i++) { + padding_ids.push_back(end_id); + } + tmp_start_ids.push_back(padding_ids); + tmp_start_lengths.push_back(max_input_len); + } + + // Add padding + for (int i = 0; i < (int)tmp_start_ids.size(); i++) { + for (int j = (int)tmp_start_ids[i].size(); j < max_input_len; j++) { + tmp_start_ids[i].push_back(end_id); + } + } + + for (int i = 0; i < (int)tmp_start_ids.size(); i++) { + for (int b = 0; b < beam_width; b++) { + for (int j = 0; j < (int)tmp_start_ids[i].size(); j++) { + v_start_ids->push_back(tmp_start_ids[i][j]); + } + v_start_lengths->push_back(tmp_start_lengths[i]); + } + } + return batch_size; +} + +} // namespace fastertransformer diff --git a/examples/cpp/llama/llama_example_utils.h b/examples/cpp/llama/llama_example_utils.h new file mode 100644 index 000000000..911cdf49a --- /dev/null +++ b/examples/cpp/llama/llama_example_utils.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace fastertransformer { + +int read_start_ids(size_t batch_size, + std::vector* v_start_lengths, + std::vector* v_start_ids, + size_t& max_input_len, + const int end_id, + const int beam_width, + std::string file_name); +} // namespace fastertransformer From 662d3b6cd2ccf7ed5c4222eea259bd1dd2278e7b Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 15 Sep 2023 08:46:19 +0000 Subject: [PATCH 08/55] fix loadModel to load llama & fix invokeGeneralLLaMALayerNorm to invoke RMSNorm --- examples/cpp/llama/llama_example.cc | 47 +- examples/cpp/llama/llama_example_utils.h | 1 + .../kernels/layernorm_kernels.cu | 94 ++++ .../kernels/layernorm_kernels.h | 13 +- src/fastertransformer/models/llama/LLaMA.cc | 408 ++++++------------ src/fastertransformer/models/llama/LLaMA.h | 100 ++--- .../models/llama/LLaMAContextDecoder.cc | 167 +++---- .../models/llama/LLaMAContextDecoder.h | 38 +- .../models/llama/LLaMADecoder.cc | 129 +++--- .../models/llama/LLaMADecoderLayerWeight.cc | 186 ++++---- .../models/llama/LLaMADecoderLayerWeight.h | 9 +- .../models/llama/LLaMAWeight.cc | 61 ++- .../models/llama/LLaMAWeight.h | 4 - 13 files changed, 579 insertions(+), 678 deletions(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 4d0d60a93..c1f4521bf 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -14,13 +14,13 @@ * limitations under the License. */ +#include "3rdparty/INIReader.h" +#include "examples/cpp/llama/llama_example_utils.h" #include "src/fastertransformer/models/llama/LLaMA.h" #include "src/fastertransformer/utils/mpi_utils.h" #include "src/fastertransformer/utils/nccl_utils.h" #include "src/fastertransformer/utils/nvtx_utils.h" #include "src/fastertransformer/utils/word_list.h" -#include "examples/cpp/llama/llama_example_utils.h" -#include "3rdparty/INIReader.h" #include #include @@ -71,9 +71,9 @@ int main(int argc, char* argv[]) template void llama_example(const INIReader reader) { - const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); - std::string model_dir = std::string(reader.Get("ft_instance_hyperparameter", "model_dir")); - int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); + const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); + std::string model_dir = std::string(reader.Get("ft_instance_hyperparameter", "model_dir")); + int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); const size_t head_num = reader.GetInteger(model_name, "head_num"); const size_t size_per_head = reader.GetInteger(model_name, "size_per_head"); @@ -85,7 +85,7 @@ void llama_example(const INIReader reader) const int end_id = reader.GetInteger(model_name, "end_id"); const size_t hidden_units = head_num * size_per_head; - const size_t inter_size = multiple_of * ((2 * hidden_units + multiple_of -1) / multiple_of); + const size_t inter_size = multiple_of * ((2 * hidden_units + multiple_of - 1) / multiple_of); const size_t beam_width = reader.GetInteger("request", "beam_width"); const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); @@ -207,8 +207,6 @@ void llama_example(const INIReader reader) inter_size, vocab_size, decoder_layers, - tensor_para.world_size_, - tensor_para.rank_, pipeline_para.world_size_, pipeline_para.rank_); @@ -231,22 +229,22 @@ void llama_example(const INIReader reader) true); // 
causal_mask LLaMA llama = LLaMA(head_num, - size_per_head, - inter_size, - decoder_layers, - vocab_size, - rotary_embedding_dim, - start_id, - end_id, - random_seed, - tensor_para, - pipeline_para, - stream, - &cublas_wrapper, - &allocator, - false, - &prop, - attention_type); + size_per_head, + inter_size, + decoder_layers, + vocab_size, + rotary_embedding_dim, + start_id, + end_id, + random_seed, + tensor_para, + pipeline_para, + stream, + &cublas_wrapper, + &allocator, + false, + &prop, + attention_type); int* d_output_ids; int* d_sequence_lengths; @@ -265,7 +263,6 @@ void llama_example(const INIReader reader) {"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, start_ids.data()}}, {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, end_ids.data()}}}; - input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}); std::unordered_map output_tensors = std::unordered_map{ diff --git a/examples/cpp/llama/llama_example_utils.h b/examples/cpp/llama/llama_example_utils.h index 911cdf49a..1e5d0b9ab 100644 --- a/examples/cpp/llama/llama_example_utils.h +++ b/examples/cpp/llama/llama_example_utils.h @@ -28,4 +28,5 @@ int read_start_ids(size_t batch_size, const int end_id, const int beam_width, std::string file_name); + } // namespace fastertransformer diff --git a/src/fastertransformer/kernels/layernorm_kernels.cu b/src/fastertransformer/kernels/layernorm_kernels.cu index 369030b37..b19e9ac73 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.cu +++ b/src/fastertransformer/kernels/layernorm_kernels.cu @@ -1859,6 +1859,100 @@ template void invokeGeneralT5LayerNorm(__nv_bfloat16* out, cudaStream_t stream); #endif +/******************* invokeGeneralLLaMALayerNorm ***********************/ + +template +__global__ void generalLLaMALayerNorm(const T* __restrict input, + const T* __restrict gamma, + const T* __restrict beta, + T* normed_output, + const float layernorm_eps, + int m, + int n) +{ + const int tid = threadIdx.x; + + extern __shared__ __align__(sizeof(float)) char _shmem[]; + T* shmem = reinterpret_cast(_shmem); + + __shared__ float s_mean_sq; + float mean_sq = 0.0f; + + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + float local_sum = 0.0f; + for (int i = tid; i < n; i += blockDim.x) { + float val = (float)(ldg(&input[blockIdx.x * n + i])); + local_sum += val * val; + } + + mean_sq = blockReduceSum(local_sum); + + if (threadIdx.x == 0) { + s_mean_sq = rsqrtf(mean_sq / (float)n + layernorm_eps); + } + __syncthreads(); + + for (int i = tid; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + float beta_val = (beta == nullptr) ? 0.0f : (float)ldg(&beta[i]); + T val = (T)(((float)input[index] * s_mean_sq) * (float)(ldg(&gamma[i])) + beta_val); + + normed_output[index] = val; + } +} + +template +void invokeGeneralLLaMALayerNorm(T* out, + const T* input, + const T* gamma, + const T* beta, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream) +{ + dim3 grid(m); + dim3 block(min(n, 1024)); + + /* For general cases, n is equal to hidden_units, e.g., 512/1024. + Since we have warp shuffle inside the code, block.x % 32 should be 0. 
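       Note: generalLLaMALayerNorm is an RMSNorm rather than a standard LayerNorm;
       it skips the mean subtraction and normalizes each row by its root mean
       square, i.e.

           y_i = x_i * rsqrt( (1/n) * sum_j x_j^2 + layernorm_eps ) * gamma_i + beta_i

       with beta_i treated as 0 when beta is nullptr. This matches the per-row
       mean-square block reduction and rsqrtf in the kernel above, and the commit's
       stated goal of making invokeGeneralLLaMALayerNorm compute RMSNorm for LLaMA.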
+ */ + if (n % 32 != 0) { + block.x = 1024; + } + + generalLLaMALayerNorm<<>>(input, gamma, beta, out, layernorm_eps, m, n); +} + +template void invokeGeneralLLaMALayerNorm(float* out, + const float* input, + const float* gamma, + const float* beta, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); +template void invokeGeneralLLaMALayerNorm(half* out, + const half* input, + const half* gamma, + const half* beta, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeGeneralLLaMALayerNorm(__nv_bfloat16* out, + const __nv_bfloat16* input, + const __nv_bfloat16* gamma, + const __nv_bfloat16* beta, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); +#endif + /******************* invokeLayernormShiftPartition ***********************/ // applied to half2 and bfloat162 diff --git a/src/fastertransformer/kernels/layernorm_kernels.h b/src/fastertransformer/kernels/layernorm_kernels.h index d8ac09234..5c5c03c7a 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.h +++ b/src/fastertransformer/kernels/layernorm_kernels.h @@ -24,7 +24,8 @@ namespace fastertransformer { -enum class LayerNormType { +enum class LayerNormType +{ pre_layernorm, post_layernorm, InvalidType @@ -161,6 +162,16 @@ void invokeGeneralT5LayerNorm(T* out, const int n, cudaStream_t stream); +template +void invokeGeneralLLaMALayerNorm(T* out, + const T* input, + const T* gamma, + const T* beta, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); + template void invokeGeneralAddResidualT5PreLayerNorm(T* output, T* norm_output, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 3734b63d5..9fcab580b 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -16,8 +16,8 @@ #include "src/fastertransformer/models/llama/LLaMA.h" #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" -#include "src/fastertransformer/kernels/gpt_kernels.h" #include "src/fastertransformer/kernels/decoding_kernels.h" +#include "src/fastertransformer/kernels/gpt_kernels.h" #include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" #include @@ -59,7 +59,7 @@ void LLaMA::initialize() enable_custom_all_reduce_); dynamic_decode_layer_ = new DynamicDecodeLayer(vocab_size_, - vocab_size_padded_, + vocab_size_, 0, // end_id, deprecated stream_, cublas_wrapper_, @@ -79,29 +79,24 @@ void LLaMA::allocateBuffer( size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - const size_t batchxbeam = batch_size * beam_width; - const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batchxbeam * max_cache_seq_len - * hidden_units_ / tensor_para_.world_size_; - - if (vocab_size_ != vocab_size_padded_) { - padded_embedding_kernel_ = - (T*)(allocator_->reMalloc(padded_embedding_kernel_, sizeof(T) * hidden_units_ * vocab_size_padded_, true)); - padded_embedding_kernel_ptr_ = padded_embedding_kernel_; - - padded_embedding_bias_ = - (T*)(allocator_->reMalloc(padded_embedding_bias_, sizeof(T) * vocab_size_padded_, true)); - } - - input_attention_mask_ = (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); - decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * 
hidden_units_, false)); - decoder_output_buf_ = (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); - nccl_logits_buf_ = (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); - cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); - finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); - h_finished_buf_ = new bool[batchxbeam]; - sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); + const size_t batchxbeam = batch_size * beam_width; + const size_t self_cache_size = + (num_layer_ / pipeline_para_.world_size_) * batchxbeam * max_cache_seq_len * hidden_units_; + + input_attention_mask_ = (T*)(allocator_->reMalloc( + input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); + decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + decoder_output_buf_ = + (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + normed_decoder_output_buf_ = + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_, false)); + nccl_logits_buf_ = + (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_, false)); + cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); + finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); + h_finished_buf_ = new bool[batchxbeam]; + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); value_cache_ = key_cache_ + self_cache_size; @@ -111,12 +106,6 @@ void LLaMA::allocateBuffer( cache_indirections_[1] = cache_indirections_[0] + batchxbeam * max_seq_len; } - // prompt_learning weight batch ptrs - prompt_learning_weight_batch_ = - (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); - tiled_prompt_lengths_buf_ = - (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); - tiled_input_ids_buf_ = (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batchxbeam, true)); @@ -149,12 +138,6 @@ template void LLaMA::freeBuffer() { if (is_allocate_buffer_) { - if (vocab_size_ != vocab_size_padded_) { - padded_embedding_kernel_ptr_ = nullptr; - allocator_->free((void**)(&padded_embedding_kernel_)); - allocator_->free((void**)(&padded_embedding_bias_)); - } - allocator_->free((void**)(&input_attention_mask_)); allocator_->free((void**)(&decoder_input_buf_)); allocator_->free((void**)(&decoder_output_buf_)); @@ -171,9 +154,6 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&cache_indirections_)[0]); } - allocator_->free((void**)(&prompt_learning_weight_batch_)); - 
allocator_->free((void**)(&tiled_prompt_lengths_buf_)); - allocator_->free((void**)(&tiled_input_ids_buf_)); allocator_->free((void**)(&tiled_input_lengths_buf_)); allocator_->free((void**)(&tiled_total_padding_count_)); @@ -199,22 +179,22 @@ void LLaMA::freeBuffer() template LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - int start_id, - int end_id, - unsigned long long random_seed, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + unsigned long long random_seed, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -225,42 +205,33 @@ LLaMA::LLaMA(size_t head_num, start_id_(start_id), end_id_(end_id), hidden_units_(head_num * size_per_head), - local_head_num_(head_num / 1), attention_type_(attention_type) { - tensor_para_.world_size_ = 1; - tensor_para_.rank_ = 0; pipeline_para_.world_size_ = 1; pipeline_para_.rank_ = 0; - - int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); - if (std::is_same::value) { - local_vacab_size = ceil(local_vacab_size / 8.f) * 8; - } - vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; initialize(); } template LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - int start_id, - int end_id, - unsigned long long random_seed, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + unsigned long long random_seed, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -271,18 +242,11 @@ LLaMA::LLaMA(size_t head_num, start_id_(start_id), end_id_(end_id), hidden_units_(head_num * size_per_head), - tensor_para_(tensor_para), pipeline_para_(pipeline_para), - local_head_num_(head_num / tensor_para.world_size_), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce), attention_type_(attention_type) { - int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); - if 
(std::is_same::value) { - local_vacab_size = ceil(local_vacab_size / 8.f) * 8; - } - vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; initialize(); } @@ -297,12 +261,8 @@ LLaMA::LLaMA(LLaMA const& llama): rotary_embedding_dim_(llama.rotary_embedding_dim_), start_id_(llama.start_id_), end_id_(llama.end_id_), - prompt_learning_start_id_(llama.prompt_learning_start_id_), hidden_units_(llama.hidden_units_), - tensor_para_(llama.tensor_para_), pipeline_para_(llama.pipeline_para_), - local_head_num_(llama.local_head_num_), - vocab_size_padded_(llama.vocab_size_padded_), custom_all_reduce_comm_(llama.custom_all_reduce_comm_), enable_custom_all_reduce_(llama.enable_custom_all_reduce_), attention_type_(llama.attention_type_) @@ -335,16 +295,16 @@ void LLaMA::unRegisterCallback() template void LLaMA::forward(std::vector* output_tensors, - const std::vector* input_tensors, - const LLaMAWeight* llama_weights) + const std::vector* input_tensors, + const LLaMAWeight* llama_weights) { FT_CHECK(false); } template void LLaMA::forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const LLaMAWeight* llama_weights) + const std::unordered_map* input_tensors, + const LLaMAWeight* llama_weights) { // input_tensors: // input_ids [batch_size, max_input_length] @@ -385,13 +345,6 @@ void LLaMA::forward(std::unordered_map* output_ten const size_t batch_size = output_tensors->at("output_ids").shape[0]; const size_t beam_width = output_tensors->at("output_ids").shape[1]; - - // Prefix Prompt Inputs - // Padding works as follows: p p x x i i i x x --> p p i i i x x x x (p denotes prompt, i denotes input, x denotes - // pad) - // TODO (perkzz): move unnecessary paddings - int max_prefix_prompt_length = 0; - // NOTE: Prefix Prompt PreProcessing // get prefix_prompt_weight for each batch --> shape [batch, beam_width] // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] @@ -431,15 +384,12 @@ void LLaMA::forward(std::unordered_map* output_ten const std::vector self_k_cache_shape = {num_layer_ / pipeline_para_.world_size_, batch_size * beam_width, - local_head_num_, + head_num_, size_per_head_ / (16 / sizeof(T)), max_cache_seq_len, 16 / sizeof(T)}; - const std::vector self_v_cache_shape = {num_layer_ / pipeline_para_.world_size_, - batch_size * beam_width, - local_head_num_, - max_cache_seq_len, - size_per_head_}; + const std::vector self_v_cache_shape = { + num_layer_ / pipeline_para_.world_size_, batch_size * beam_width, head_num_, max_cache_seq_len, size_per_head_}; // initialize the output ids and parent ids cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); @@ -452,8 +402,11 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); + std::cout << __FILE__ << ":" << __LINE__ << "\n"; + // handle first step if (max_input_length > 1) { + std::cout << __FILE__ << ":" << __LINE__ << "\n"; invokeTileGptInputs(tiled_input_ids_buf_, tiled_input_lengths_buf_, input_tensors->at("input_ids").getPtr(), @@ -465,22 +418,22 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, - output_ids_buf_, - llama_weights->pre_decoder_embedding_table, - llama_weights->position_encoding_table, - pPromptTuningParam{}, // no p/prompt tuning - tiled_input_ids_buf_, - 1, - max_input_length, - max_input_length, - batch_size * beam_width, - hidden_units_, - stream_); + output_ids_buf_, + 
llama_weights->pre_decoder_embedding_table, + llama_weights->position_encoding_table, + pPromptTuningParam{}, // no p/prompt tuning + tiled_input_ids_buf_, + 1, + max_input_length, + max_input_length, + batch_size * beam_width, + hidden_units_, + stream_); sync_check_cuda_error(); invokeBuildDecoderAttentionMask(input_attention_mask_, tiled_input_lengths_buf_, - tiled_prompt_lengths_buf_, + nullptr, batch_size * beam_width, max_input_length, 0, @@ -496,22 +449,9 @@ void LLaMA::forward(std::unordered_map* output_ten {"attention_mask", Tensor{MEMORY_GPU, data_type, - {batch_size * beam_width, - 1, - (size_t)max_input_length, - (size_t)(max_input_length)}, + {batch_size * beam_width, 1, (size_t)max_input_length, (size_t)(max_input_length)}, input_attention_mask_}}, - {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_}}, - {"d_prefix_prompt_batch", - Tensor{MEMORY_GPU, - data_type, - {batch_size * beam_width}, - nullptr}}, - {"d_prefix_prompt_lengths", - Tensor{MEMORY_GPU, - TYPE_INT32, - {batch_size * beam_width}, - nullptr}}}; + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_}}}; std::unordered_map decoder_output_tensors{ {"decoder_output", @@ -524,9 +464,11 @@ void LLaMA::forward(std::unordered_map* output_ten {"last_token_hidden_units", Tensor{MEMORY_GPU, data_type, {batch_size * beam_width, hidden_units_}, decoder_output_buf_}}}; + std::cout << __FILE__ << ":" << __LINE__ << "\n"; llama_context_decoder_->forward( &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); + std::cout << __FILE__ << ":" << __LINE__ << "\n"; invokeDecodingInitialize(finished_buf_, sequence_lengths_, nullptr, @@ -537,7 +479,8 @@ void LLaMA::forward(std::unordered_map* output_ten max_input_length - 1, stream_); sync_check_cuda_error(); - } else if (max_input_length == 0) { + } + else if (max_input_length == 0) { max_input_length++; invokeDecodingInitialize(finished_buf_, sequence_lengths_, @@ -555,7 +498,8 @@ void LLaMA::forward(std::unordered_map* output_ten cudaMemcpyHostToDevice, stream_); sync_check_cuda_error(); - } else if (max_input_length == 1) { + } + else if (max_input_length == 1) { invokeDecodingInitialize(finished_buf_, sequence_lengths_, nullptr, @@ -582,27 +526,11 @@ void LLaMA::forward(std::unordered_map* output_ten cudaMemcpyDeviceToDevice, stream_); } - - if (vocab_size_ == vocab_size_padded_) { - padded_embedding_kernel_ptr_ = llama_weights->post_decoder_embedding.kernel; - } - else { - cudaMemcpyAsync(padded_embedding_kernel_, - llama_weights->post_decoder_embedding.kernel, - sizeof(T) * vocab_size_ * hidden_units_, - cudaMemcpyDeviceToDevice, - stream_); - cudaMemcpyAsync(padded_embedding_bias_, - llama_weights->post_decoder_embedding.bias, - sizeof(T) * vocab_size_, - cudaMemcpyDeviceToDevice, - stream_); - sync_check_cuda_error(); - } + std::cout << __FILE__ << ":" << __LINE__ << "\n"; invokeMaskPaddingTokens(masked_tokens_, input_tensors->at("input_lengths").getPtr(), // not_tiled - tiled_prompt_lengths_buf_, + nullptr, max_cache_seq_len, max_input_length, 0, @@ -611,6 +539,7 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); for (int step = max_input_length; step < (int)max_output_seq_len; step++) { + std::cout << __FILE__ << ":" << __LINE__ << "\n"; const int src_indir_idx = (step - max_input_length) % 2; const int tgt_indir_idx = 1 - src_indir_idx; @@ -622,7 +551,7 @@ void LLaMA::forward(std::unordered_map* 
output_ten for (uint ite = 0; ite < iteration_num; ++ite) { const int id_offset = ite * local_batch_size * beam_width; const int hidden_units_offset = id_offset * hidden_units_; - const int vocab_size_units_offset = id_offset * vocab_size_padded_; + const int vocab_size_units_offset = id_offset * vocab_size_; if (!(max_input_length > 1 && step == max_input_length)) { if (pipeline_para_.rank_ == 0) { @@ -655,12 +584,6 @@ void LLaMA::forward(std::unordered_map* output_ten TYPE_INT32, {local_batch_size * beam_width}, tiled_total_padding_count_ + id_offset}}, - {"d_prefix_prompt_lengths", - Tensor{MEMORY_GPU, - TYPE_INT32, - {local_batch_size}, - nullptr}}, - {"max_prefix_prompt_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_prefix_prompt_length}}, {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, {"ite", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &ite}}, @@ -688,85 +611,42 @@ void LLaMA::forward(std::unordered_map* output_ten } if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { - invokeGeneralLayerNorm(normed_decoder_output_buf_ + hidden_units_offset, - decoder_output_buf_ + hidden_units_offset, - llama_weights->post_decoder_layernorm.gamma, - llama_weights->post_decoder_layernorm.beta, - layernorm_eps_, - local_batch_size * beam_width, - hidden_units_, - (float*)nullptr, - 0, - stream_); + invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_ + hidden_units_offset, + decoder_output_buf_ + hidden_units_offset, + llama_weights->post_decoder_layernorm.gamma, + llama_weights->post_decoder_layernorm.beta, + layernorm_eps_, + local_batch_size * beam_width, + hidden_units_, + stream_); sync_check_cuda_error(); - if (tensor_para_.world_size_ == 1) { - float alpha = 1.0f; - float beta = 0.0f; - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - vocab_size_padded_, // n - local_batch_size * beam_width, - hidden_units_, // k - &alpha, - padded_embedding_kernel_ptr_, - gemm_data_type, - hidden_units_, // k - normed_decoder_output_buf_ + hidden_units_offset, - gemm_data_type, - hidden_units_, // k - &beta, - logits_buf_ + vocab_size_units_offset, - CUDA_R_32F, - vocab_size_padded_, /* n */ - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - } - else { - FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); - const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; - float alpha = 1.0f; - float beta = 0.0f; - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - local_vocab_size, // n - local_batch_size * beam_width, - hidden_units_, // k - &alpha, - padded_embedding_kernel_ptr_ - + tensor_para_.rank_ * local_vocab_size * hidden_units_, - gemm_data_type, - hidden_units_, // k - normed_decoder_output_buf_ + hidden_units_offset, - gemm_data_type, - hidden_units_, // k - &beta, - nccl_logits_buf_ + vocab_size_units_offset - + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, - CUDA_R_32F, - local_vocab_size, /* n */ - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, - nccl_logits_buf_ + vocab_size_units_offset, - local_batch_size * beam_width * local_vocab_size, - tensor_para_.rank_, - tensor_para_, - stream_); - invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, - nccl_logits_buf_ + vocab_size_units_offset, - tensor_para_.world_size_, - local_batch_size * beam_width, - local_vocab_size, - stream_); - } + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + vocab_size_, + 
local_batch_size * beam_width, + hidden_units_, // k + &alpha, + llama_weights->post_decoder_embedding.kernel, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + logits_buf_ + vocab_size_units_offset, + CUDA_R_32F, + vocab_size_, + CUDA_R_32F, + cublasGemmAlgo_t(-1)); int tmp_local_batch_size = local_batch_size; bool is_initialize_random_table = step == max_input_length; std::unordered_map dynamic_decode_input_tensors{ - {"logits", - Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size_padded_}, logits_buf_}}, - // {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size_padded_}, nullptr}}, + {"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size_}, logits_buf_}}, + // {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size_}, nullptr}}, {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, {"input_lengths", @@ -854,7 +734,8 @@ void LLaMA::forward(std::unordered_map* output_ten } ftNcclGroupEnd(); // throw errors when detected - ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); + NcclParam tensor_para(0, 1); + ftNcclStreamSynchronize(tensor_para, pipeline_para_, stream_); sync_check_cuda_error(); } @@ -865,14 +746,13 @@ void LLaMA::forward(std::unordered_map* output_ten setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); - if (pipeline_para_.rank_ == 0 && tensor_para_.rank_ == 0) { + if (pipeline_para_.rank_ == 0) { token_generated_cb_(output_tensors, token_generated_ctx_); } } if (step == max_input_length) { /* We have just finished processing input: update the padding count: * total_padding_count += (max_input_length - input_lengths) - * if has prefix prompts, += (max_prefix_prompt_length - prompt_length) */ invokeUpdatePaddingCount(tiled_total_padding_count_, input_tensors->at("input_lengths").getPtr(), // not_tiled @@ -884,6 +764,7 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); } } + std::cout << __FILE__ << ":" << __LINE__ << "\n"; setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); @@ -891,15 +772,16 @@ void LLaMA::forward(std::unordered_map* output_ten template void LLaMA::sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors) + const std::unordered_map* input_tensors) { + NcclParam tensor_para(0, 1); + FT_LOG_DEBUG(__PRETTY_FUNCTION__); if (pipeline_para_.world_size_ == 1) { // throw errors when detected - ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); + ftNcclStreamSynchronize(tensor_para, pipeline_para_, stream_); return; } - const auto pp_rank = pipeline_para_.rank_; ftNcclGroupStart(); @@ -921,14 +803,14 @@ void LLaMA::sendTensorsToFirstPipelineNode(std::unordered_map void LLaMA::setOutputTensors(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const size_t max_input_length, - const size_t max_output_seq_len) + const std::unordered_map* input_tensors, + const size_t max_input_length, + const size_t max_output_seq_len) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); if (pipeline_para_.rank_ != pipeline_para_.world_size_ - 1) { @@ -981,15 +863,15 @@ void LLaMA::setOutputTensors(std::unordered_map* o param.beams = transposed_output_ids_buf_; 
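        // Note: these fields appear to populate FT's gatherTreeParam. For beam search
        // the gather-tree step retraces parent_ids from the final step back to step 0
        // so that output_ids holds the tokens of the surviving beams rather than the
        // per-step selections; with beam_width == 1, parent_ids is passed as nullptr
        // and no beam reordering is needed.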
param.max_sequence_lengths = sequence_lengths_; // add sequence_length 1 here because the sequence_length of time step t is t - 1 - param.max_sequence_length_final_step = 1; - param.max_time = max_output_seq_len; - param.batch_size = batch_size; - param.beam_width = beam_width; - param.step_ids = output_ids_buf_; - param.parent_ids = beam_width == 1 ? nullptr : parent_ids_buf_; - param.end_tokens = end_ids_buf_; - param.max_input_length = max_input_length; - param.prefix_soft_prompt_lengths = nullptr; + param.max_sequence_length_final_step = 1; + param.max_time = max_output_seq_len; + param.batch_size = batch_size; + param.beam_width = beam_width; + param.step_ids = output_ids_buf_; + param.parent_ids = beam_width == 1 ? nullptr : parent_ids_buf_; + param.end_tokens = end_ids_buf_; + param.max_input_length = max_input_length; + param.prefix_soft_prompt_lengths = nullptr; param.input_lengths = tiled_input_lengths_buf_; param.max_prefix_soft_prompt_length = 0; param.max_input_without_prompt_length = max_input_length; @@ -1029,18 +911,6 @@ size_t LLaMA::getPipelineParallelSize() return pipeline_para_.world_size_; } -template -size_t LLaMA::getTensorParallelRank() -{ - return tensor_para_.rank_; -} - -template -size_t LLaMA::getTensorParallelSize() -{ - return tensor_para_.world_size_; -} - template bool* LLaMA::getFinishBuffer() { diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 2f4f52c7b..7a66a2ebf 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -24,7 +24,6 @@ #include "src/fastertransformer/models/llama/LLaMADecoder.h" #include "src/fastertransformer/models/llama/LLaMAWeight.h" #include "src/fastertransformer/utils/custom_ar_comm.h" -#include "src/fastertransformer/utils/prompt_learning.h" namespace fastertransformer { @@ -40,13 +39,12 @@ class LLaMA: public BaseLayer { size_t rotary_embedding_dim_; static constexpr bool neox_rotary_style_ = true; - static constexpr float layernorm_eps_ = 1e-5f; + static constexpr float layernorm_eps_ = 1e-6f; int start_id_; int end_id_; size_t hidden_units_; - size_t local_head_num_; NcclParam tensor_para_; NcclParam pipeline_para_; @@ -55,19 +53,11 @@ class LLaMA: public BaseLayer { AttentionType attention_type_; - size_t vocab_size_padded_; - const bool is_context_qk_buf_float_ = - (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || - std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); + const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr + || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); - // Prompt Learning Parameters - PromptLearningType prompt_learning_type_; - int prompt_learning_start_id_; // start_id for prompt_learning (only needed by prefix prompts) - bool has_prefix_prompt_; - bool has_prefix_soft_prompt_; - - LLaMADecoder* llama_decoder_; - LLaMAContextDecoder* llama_context_decoder_; + LLaMADecoder* llama_decoder_; + LLaMAContextDecoder* llama_context_decoder_; DynamicDecodeLayer* dynamic_decode_layer_; void allocateBuffer() override; @@ -78,10 +68,6 @@ class LLaMA: public BaseLayer { void initialize(); protected: - T* padded_embedding_kernel_; - T* padded_embedding_bias_; - const T* padded_embedding_kernel_ptr_; - T* input_attention_mask_; T* decoder_input_buf_; @@ -102,10 +88,6 @@ class LLaMA: public BaseLayer { T* value_cache_; int* cache_indirections_[2] = {nullptr, nullptr}; - // prompt_learning weight_batch ptrs - 
const T** prompt_learning_weight_batch_; - int* tiled_prompt_lengths_buf_; // only needed by prefix prompts - int* tiled_input_ids_buf_; int* tiled_input_lengths_buf_; int* transposed_output_ids_buf_; @@ -135,42 +117,42 @@ class LLaMA: public BaseLayer { public: LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - int start_id, - int end_id, - unsigned long long random_seed, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + unsigned long long random_seed, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce = 0); LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - int start_id, - int end_id, - unsigned long long random_seed, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + unsigned long long random_seed, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce = 0); LLaMA(LLaMA const& LLaMA); @@ -178,11 +160,11 @@ class LLaMA: public BaseLayer { void forward(std::vector* output_tensors, const std::vector* input_tensors, - const LLaMAWeight* llama_weights); + const LLaMAWeight* llama_weights); void forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, - const LLaMAWeight* llama_weights); + const LLaMAWeight* llama_weights); size_t getPipelineParallelRank(); size_t getPipelineParallelSize(); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index ecf127ae6..e8f4a4e21 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -21,7 +21,6 @@ #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" - namespace fastertransformer { template @@ -42,8 +41,8 @@ void LLaMAContextDecoder::initialize() false, 0); - ffn_layer_ = new GeluFfnLayer(0, // max_batch_size - 1, + ffn_layer_ = new GeluFfnLayer(0, // max_batch_size + 1, head_num_, 
size_per_head_, 0, // expert_num @@ -55,7 +54,7 @@ void LLaMAContextDecoder::initialize() false, 0, false // use_gated_activation = false - ); + ); } template @@ -128,21 +127,21 @@ int LLaMAContextDecoder::getFirstLayerParallelId() template LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - bool neox_rotary_style, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - AttentionType attention_type, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + float layernorm_eps, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), head_num_(head_num), size_per_head_(size_per_head), @@ -190,9 +189,9 @@ LLaMAContextDecoder::~LLaMAContextDecoder() } template -void LLaMAContextDecoder::forward(std::vector* output_tensors, - const std::vector* input_tensors, - const std::vector*>* llama_decoder_layer_weight) +void LLaMAContextDecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* llama_decoder_layer_weight) { std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, {"attention_mask", input_tensors->at(1)}, @@ -206,17 +205,14 @@ void LLaMAContextDecoder::forward(std::vector* } template -void LLaMAContextDecoder::forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const std::vector*>* llama_decoder_layer_weight) +void LLaMAContextDecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* llama_decoder_layer_weight) { // input tensors: // decoder_input [batch_size, seq_len, hidden_dimension], // attention_mask [batch_size, 1, seq_len, seq_len + max_prompt_length] // input_lengths [batch_size] - // d_prefix_prompt_batch [batch_size], - // each element contains ptr with buffer shape[2, local_head_num_, prompt_length, size_per_head] - // prefix_prompt_lengths [batch size] // output tensors: // decoder_output [batch_size, seq_len, hidden_dimension], @@ -228,7 +224,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. 
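// ---------------------------------------------------------------------------
// Editor's note (illustrative addition, not part of this patch): the comment
// block above describes how LLaMAContextDecoder::forward walks the request
// batch in micro-batches ("ite") so that pipeline-parallel ranks can overlap
// work; the hunk that follows pins local_batch_size to batch_size, i.e. a
// single micro-batch. The helper below is a minimal sketch of that splitting
// idea only -- run_micro_batches is a hypothetical name, it assumes the
// pipeline world size divides the batch evenly, and it is not the
// FasterTransformer implementation of getLocalBatchSize.
// ---------------------------------------------------------------------------
#include <cassert>
#include <cstdio>

void run_micro_batches(int batch_size, int pipeline_world_size)
{
    const int local_batch_size = batch_size / pipeline_world_size;  // assumed to divide evenly
    assert(local_batch_size > 0 && batch_size % local_batch_size == 0);
    const int iteration_num = batch_size / local_batch_size;

    for (int ite = 0; ite < iteration_num; ite++) {
        // Each iteration covers rows [ite * local_batch_size, (ite + 1) * local_batch_size)
        // of tensors shaped [batch_size, seq_len, hidden_dimension].
        std::printf("micro-batch %d: rows %d..%d\n",
                    ite, ite * local_batch_size, (ite + 1) * local_batch_size - 1);
    }
}

int main()
{
    run_micro_batches(/*batch_size=*/8, /*pipeline_world_size=*/2);
    return 0;
}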
- FT_CHECK(input_tensors->size() == 5); + FT_CHECK(input_tensors->size() == 3); FT_CHECK(output_tensors->size() == 4); const int batch_size = input_tensors->at("decoder_input").shape[0]; @@ -238,13 +234,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* const DataType data_type = getTensorType(); allocateBuffer(batch_size, seq_len); - T* decoder_input = input_tensors->at("decoder_input").getPtr(); - T* decoder_output = output_tensors->at("decoder_output").getPtr(); - const T* attention_mask = input_tensors->at("attention_mask").getPtr(); - const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); - const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + T* decoder_input = input_tensors->at("decoder_input").getPtr(); + T* decoder_output = output_tensors->at("decoder_output").getPtr(); + const T* attention_mask = input_tensors->at("attention_mask").getPtr(); - const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); + // const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); + const int local_batch_size = batch_size; FT_CHECK(batch_size % local_batch_size == 0); const int iteration_num = batch_size / local_batch_size; @@ -261,9 +256,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } - AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? - getUnfusedAttentionType(attention_type_) : - attention_type_; + AttentionType attention_type = attention_type_; const bool is_unpadded_mha = isUnPaddedMHA(attention_type); for (int ite = 0; ite < iteration_num; ite++) { @@ -309,23 +302,19 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; - ftNcclRecv(layer_input, - data_size, - pipeline_para_.rank_ - 1, - pipeline_para_, - stream_); + std::cout << __FILE__ << ":" << __LINE__ << "\n"; + std::cout << "Recv: " << layer_output << "," << data_size << "\n"; + ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); } - invokeGeneralLayerNorm(decoder_normed_input_, - layer_input, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, - layernorm_eps_, - h_token_num, - hidden_units_, - (float*)nullptr, - 0, - stream_); + invokeGeneralLLaMALayerNorm(decoder_normed_input_, + layer_input, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + h_token_num, + hidden_units_, + stream_); sync_check_cuda_error(); TensorMap self_attention_input_tensors{ @@ -339,19 +328,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; - self_attention_input_tensors.insertIfValid( - "d_prefix_prompt_batch", - Tensor{MEMORY_GPU, - data_type, - {(size_t)local_batch_size}, - d_prefix_prompt_batch != nullptr ? d_prefix_prompt_batch + ite * local_batch_size : nullptr}); - self_attention_input_tensors.insertIfValid("d_prefix_prompt_lengths", - Tensor{MEMORY_GPU, - TYPE_INT32, - {(size_t)local_batch_size}, - d_prefix_prompt_lengths != nullptr ? 
- d_prefix_prompt_lengths + ite * local_batch_size : - nullptr}); if (is_unpadded_mha) { self_attention_input_tensors.insert("padding_offset", @@ -383,51 +359,48 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (is_final == false) { invokeGeneralAddBiasResidualPreLayerNorm( - self_attn_output_, - decoder_normed_input_, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, - layernorm_eps_, - h_token_num, - hidden_units_, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - 0, - stream_); + self_attn_output_, + decoder_normed_input_, + self_attn_output_, + layer_input, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + h_token_num, + hidden_units_, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + 0, + stream_); TensorMap ffn_input_tensors( {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); - TensorMap ffn_output_tensors({{"ffn_output", - Tensor{MEMORY_GPU, - data_type, - {h_token_num, (size_t)hidden_units_}, - layer_output}}}); + TensorMap ffn_output_tensors( + {{"ffn_output", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); ffn_layer_->forward( &ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - h_token_num, - hidden_units_, - stream_); + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + stream_); sync_check_cuda_error(); if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; - ftNcclSend(layer_output, - data_size, - pipeline_para_.rank_ + 1, - pipeline_para_, - stream_); + std::cout << __FILE__ << ":" << __LINE__ << "\n"; + std::cout << "Send: " << layer_output << "," << data_size << "\n"; + ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); + std::cout << __FILE__ << ":" << __LINE__ << "\n"; } if ((l == num_layer_ - 1) && is_unpadded_mha) { diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index c9c474e49..115b3b06b 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -81,32 +81,32 @@ class LLaMAContextDecoder: public BaseLayer { public: LLaMAContextDecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - bool neox_rotary_style, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - AttentionType attention_type = AttentionType::FUSED_MHA, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce_ = 0); + size_t size_per_head, + 
size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + float layernorm_eps, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type = AttentionType::FUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce_ = 0); LLaMAContextDecoder(LLaMAContextDecoder const& decoder); ~LLaMAContextDecoder(); - void forward(std::vector* output_tensors, - const std::vector* input_tensors, + void forward(std::vector* output_tensors, + const std::vector* input_tensors, const std::vector*>* decoder_layer_weights); - void forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, + void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, const std::vector*>* llama_decoder_layer_weight); }; diff --git a/src/fastertransformer/models/llama/LLaMADecoder.cc b/src/fastertransformer/models/llama/LLaMADecoder.cc index 051744693..a98cd0159 100644 --- a/src/fastertransformer/models/llama/LLaMADecoder.cc +++ b/src/fastertransformer/models/llama/LLaMADecoder.cc @@ -38,8 +38,8 @@ void LLaMADecoder::initialize() false, 0); - ffn_layer_ = new GeluFfnLayer(0, // max_batch_size - 1, + ffn_layer_ = new GeluFfnLayer(0, // max_batch_size + 1, head_num_, size_per_head_, 0, // expert_num @@ -51,7 +51,7 @@ void LLaMADecoder::initialize() false, 0, false // use_gated_activation = false - ); + ); } template @@ -117,19 +117,19 @@ int LLaMADecoder::getFirstLayerParallelId() template LLaMADecoder::LLaMADecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - bool neox_rotary_style, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + float layernorm_eps, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), head_num_(head_num), size_per_head_(size_per_head), @@ -173,17 +173,17 @@ LLaMADecoder::~LLaMADecoder() } template -void LLaMADecoder::forward(std::vector* output_tensors, - const std::vector* input_tensors, - const std::vector*>* llama_decoder_layer_weight) +void LLaMADecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* llama_decoder_layer_weight) { FT_CHECK(false); } template -void LLaMADecoder::forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const std::vector*>* llama_decoder_layer_weight) +void LLaMADecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* llama_decoder_layer_weight) { // input tensors: // decoder_input [local_batch_size, hidden_dimension], @@ -191,8 +191,6 @@ void LLaMADecoder::forward(std::unordered_map* // sequence_lengths [local_batch_size] // total_padding_tokens [local_batch_size], // max_input_length [1] on cpu - // d_prefix_prompt_lengths [local_batch_size], 
on GPU - // max_prefix_prompt_length [1] on cpu // step [1] on cpu // ite [1] on cpu // cache_indirection [local_batch_size / beam_width, beam_width, memory_len] @@ -241,23 +239,17 @@ void LLaMADecoder::forward(std::unordered_map* // ftNcclRecv(layer_input, local_batch_size * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, // stream_); - ftNcclRecv(layer_input, - data_size, - pipeline_para_.rank_ - 1, - pipeline_para_, - stream_); + ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); } - invokeGeneralLayerNorm(decoder_normed_input_, - layer_input, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, - layernorm_eps_, - local_batch_size, - hidden_units_, - (float*)nullptr, - 0, - stream_); + invokeGeneralLLaMALayerNorm(decoder_normed_input_, + layer_input, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + local_batch_size, + hidden_units_, + stream_); sync_check_cuda_error(); TensorMap self_attention_input_tensors(*input_tensors); @@ -280,53 +272,46 @@ void LLaMADecoder::forward(std::unordered_map* {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; self_attention_layer_->forward(&self_attention_output_tensors, - &self_attention_input_tensors, - &llama_decoder_layer_weight->at(l)->self_attention_weights); + &self_attention_input_tensors, + &llama_decoder_layer_weight->at(l)->self_attention_weights); invokeGeneralAddBiasResidualPreLayerNorm( - self_attn_output_, - decoder_normed_input_, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, - layernorm_eps_, - local_batch_size, - hidden_units_, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - 0, - stream_); + self_attn_output_, + decoder_normed_input_, + self_attn_output_, + layer_input, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + local_batch_size, + hidden_units_, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + 0, + stream_); TensorMap ffn_input_tensors( {{"ffn_input", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}}}); - TensorMap ffn_output_tensors({{"ffn_output", - Tensor{MEMORY_GPU, - data_type, - {local_batch_size, hidden_units_}, - layer_output}}}); + TensorMap ffn_output_tensors( + {{"ffn_output", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, layer_output}}}); ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - local_batch_size, - hidden_units_, - stream_); + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + local_batch_size, + hidden_units_, + stream_); sync_check_cuda_error(); if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && 
pipeline_para_.world_size_ > 1) { int data_size = local_batch_size * hidden_units_; - ftNcclSend(layer_output, - data_size, - pipeline_para_.rank_ + 1, - pipeline_para_, - stream_); + ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); } } diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc index 412a1d076..3c40613fc 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc @@ -20,14 +20,8 @@ namespace fastertransformer { template -LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const int hidden_units, - const int inter_size, - const int tensor_para_size, - const int tensor_para_rank): - hidden_units_(hidden_units), - inter_size_(inter_size), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) +LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const int hidden_units, const int inter_size): + hidden_units_(hidden_units), inter_size_(inter_size) { mallocWeights(); setWeightPtr(); @@ -37,7 +31,7 @@ template LLaMADecoderLayerWeight::~LLaMADecoderLayerWeight() { if (is_maintain_buffer == true) { - for (int i = 0; i < 12; i++) { + for (int i = 0; i < 14; i++) { if (i != attention_dense_bias_weight_id) { cudaFree(weights_ptr[i]); } @@ -62,50 +56,48 @@ LLaMADecoderLayerWeight::~LLaMADecoderLayerWeight() template LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other): - hidden_units_(other.hidden_units_), - inter_size_(other.inter_size_), - tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_) + hidden_units_(other.hidden_units_), inter_size_(other.inter_size_) { mallocWeights(); - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_); - cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + //cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); nullptr; + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_); + //cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ * hidden_units_); + //cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_); + //cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ * hidden_units_); + //cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + 
cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_ * inter_size_); + //cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); + //cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); setWeightPtr(); } template LLaMADecoderLayerWeight& LLaMADecoderLayerWeight::operator=(const LLaMADecoderLayerWeight& other) { - hidden_units_ = other.hidden_units_; - inter_size_ = other.inter_size_; - tensor_para_size_ = other.tensor_para_size_; - tensor_para_rank_ = other.tensor_para_rank_; + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; mallocWeights(); - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + //cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_); - cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_); + //cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ * hidden_units_); + //cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_); + //cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ * hidden_units_); + //cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_ * inter_size_); + //cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); + //cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); setWeightPtr(); return *this; } @@ -114,85 +106,99 @@ template void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) { FT_CHECK(is_maintain_buffer == true); - const std::string rank_spec = std::to_string(tensor_para_rank_); +// loadWeightFromBin( +// weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".attention_norm.bias.bin", model_file_type); loadWeightFromBin( - weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".input_layernorm.bias.bin", model_file_type); - loadWeightFromBin( - weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".input_layernorm.weight.bin", model_file_type); - loadWeightFromBin(weights_ptr[2], - {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, - dir_path + ".attention.query_key_value.weight." 
+ rank_spec + ".bin", - model_file_type); + weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".attention_norm.weight.bin", model_file_type); - loadWeightFromBin(weights_ptr[3], - {(size_t)(3 * hidden_units_ / tensor_para_size_)}, - dir_path + ".attention.query_key_value.bias." + rank_spec + ".bin", + loadWeightFromBin(weights_ptr[2], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_)}, + dir_path + ".attention.query_key_value.weight.bin", model_file_type); +// loadWeightFromBin(weights_ptr[3], +// {(size_t)(3 * hidden_units_)}, +// dir_path + ".attention.query_key_value.bias.bin", +// model_file_type); loadWeightFromBin(weights_ptr[4], - {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, - dir_path + ".attention.dense.weight." + rank_spec + ".bin", + {(size_t)(hidden_units_), (size_t)hidden_units_}, + dir_path + ".attention.wo.weight.bin", model_file_type); - - loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.dense.bias.bin", model_file_type); +// loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.wo.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[6], - {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, - dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", - model_file_type); - loadWeightFromBin(weights_ptr[7], - {(size_t)(inter_size_ / tensor_para_size_)}, - dir_path + ".mlp.dense_h_to_4h.bias." + rank_spec + ".bin", + {(size_t)hidden_units_, (size_t)(inter_size_)}, + dir_path + ".feed_forward.w1.weight.bin", model_file_type); +// loadWeightFromBin( +// weights_ptr[7], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w1.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[8], - {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, - dir_path + ".mlp.dense_4h_to_h.weight." 
+ rank_spec + ".bin", + {(size_t)(inter_size_), (size_t)hidden_units_}, + dir_path + ".feed_forward.w2.weight.bin", model_file_type); - loadWeightFromBin( - weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.bias.bin", model_file_type); - loadWeightFromBin( - weights_ptr[10], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.bias.bin", model_file_type); - loadWeightFromBin( - weights_ptr[11], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); +// loadWeightFromBin( +// weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".feed_forward.w2.bias.bin", model_file_type); + + loadWeightFromBin(weights_ptr[10], + {(size_t)hidden_units_, (size_t)(inter_size_)}, + dir_path + ".feed_forward.w3.weight.bin", + model_file_type); +// loadWeightFromBin( +// weights_ptr[11], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w3.bias.bin", model_file_type); + +// loadWeightFromBin(weights_ptr[12], {(size_t)hidden_units_}, dir_path + ".ffn_norm.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".ffn_norm.weight.bin", model_file_type); } template void LLaMADecoderLayerWeight::setWeightPtr() { - pre_layernorm_weights.beta = weights_ptr[0]; + //pre_layernorm_weights.beta = weights_ptr[0]; + pre_layernorm_weights.beta = nullptr; pre_layernorm_weights.gamma = weights_ptr[1]; self_attention_weights.query_weight.kernel = weights_ptr[2]; - self_attention_weights.query_weight.bias = weights_ptr[3]; + //self_attention_weights.query_weight.bias = weights_ptr[3]; + self_attention_weights.query_weight.bias = nullptr; self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; - self_attention_weights.attention_output_weight.bias = weights_ptr[5]; + //self_attention_weights.attention_output_weight.bias = weights_ptr[5]; + self_attention_weights.attention_output_weight.bias = nullptr; ffn_weights.intermediate_weight.kernel = weights_ptr[6]; - ffn_weights.intermediate_weight.bias = weights_ptr[7]; + //ffn_weights.intermediate_weight.bias = weights_ptr[7]; + ffn_weights.intermediate_weight.bias = nullptr; ffn_weights.output_weight.kernel = weights_ptr[8]; - ffn_weights.output_weight.bias = weights_ptr[9]; - - post_attention_layernorm_weights.beta = weights_ptr[10]; - post_attention_layernorm_weights.gamma = weights_ptr[11]; + //ffn_weights.output_weight.bias = weights_ptr[9]; + ffn_weights.output_weight.bias = nullptr; + ffn_weights.gating_weight.kernel = weights_ptr[10]; + //ffn_weights.gating_weight.bias = weights_ptr[11]; + ffn_weights.gating_weight.bias = nullptr; + + //post_attention_layernorm_weights.beta = weights_ptr[12]; + post_attention_layernorm_weights.beta = nullptr; + post_attention_layernorm_weights.gamma = weights_ptr[13]; is_maintain_buffer = true; } template void LLaMADecoderLayerWeight::mallocWeights() { - deviceMalloc(&weights_ptr[0], hidden_units_); + //deviceMalloc(&weights_ptr[0], hidden_units_); deviceMalloc(&weights_ptr[1], hidden_units_); - deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - deviceMalloc(&weights_ptr[5], hidden_units_); - - deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); - deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); - deviceMalloc(&weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); - 
deviceMalloc(&weights_ptr[9], hidden_units_); - deviceMalloc(&weights_ptr[10], hidden_units_); - deviceMalloc(&weights_ptr[11], hidden_units_); + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_); + //deviceMalloc(&weights_ptr[3], 3 * hidden_units_); + deviceMalloc(&weights_ptr[4], hidden_units_ * hidden_units_); + //deviceMalloc(&weights_ptr[5], hidden_units_); + + deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_); + //deviceMalloc(&weights_ptr[7], inter_size_); + deviceMalloc(&weights_ptr[8], inter_size_ * hidden_units_); + //deviceMalloc(&weights_ptr[9], hidden_units_); + deviceMalloc(&weights_ptr[10], hidden_units_ * inter_size_); + //deviceMalloc(&weights_ptr[11], inter_size_); + //deviceMalloc(&weights_ptr[12], hidden_units_); + deviceMalloc(&weights_ptr[13], hidden_units_); } template struct LLaMADecoderLayerWeight; diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h index 4a6fc6a22..35d16300f 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h @@ -29,10 +29,7 @@ template struct LLaMADecoderLayerWeight { public: LLaMADecoderLayerWeight() = default; - LLaMADecoderLayerWeight(const int hidden_units, - const int inter_size, - const int tensor_para_size = 1, - const int tensor_para_rank = 0); + LLaMADecoderLayerWeight(const int hidden_units, const int inter_size); ~LLaMADecoderLayerWeight(); LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other); LLaMADecoderLayerWeight& operator=(const LLaMADecoderLayerWeight& other); @@ -47,11 +44,9 @@ struct LLaMADecoderLayerWeight { private: int hidden_units_; int inter_size_; - int tensor_para_size_; - int tensor_para_rank_; const int attention_dense_bias_weight_id = 5; bool is_maintain_buffer = false; - T* weights_ptr[12]; + T* weights_ptr[14]; void setWeightPtr(); void mallocWeights(); diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc index f0bdc282f..81a22a51d 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.cc +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -19,20 +19,16 @@ namespace fastertransformer { template -LLaMAWeight::LLaMAWeight(const int hidden_units, - const int inter_size, - const int vocab_size, - const int num_layer, - const int tensor_para_size, - const int tensor_para_rank, - const int layer_para_size, - const int layer_para_rank): +LLaMAWeight::LLaMAWeight(const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int layer_para_size, + const int layer_para_rank): hidden_units_(hidden_units), inter_size_(inter_size), vocab_size_(vocab_size), num_layer_(num_layer), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank), layer_para_size_(layer_para_size), layer_para_rank_(layer_para_rank) { @@ -41,8 +37,7 @@ LLaMAWeight::LLaMAWeight(const int hidden_un decoder_layer_weights.reserve(num_layer_); for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { - decoder_layer_weights.push_back(new LLaMADecoderLayerWeight( - hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_)); + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight(hidden_units_, inter_size_)); } else { // Layer-parallelism: allocate empty layer because @@ -77,15 +72,13 @@ LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): inter_size_(other.inter_size_), vocab_size_(other.vocab_size_), 
num_layer_(other.num_layer_), - tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_), layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), prompt_token_weight_size_(other.prompt_token_weight_size_) { mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + //cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); @@ -102,19 +95,17 @@ LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): template LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) { - hidden_units_ = other.hidden_units_; - inter_size_ = other.inter_size_; - vocab_size_ = other.vocab_size_; - num_layer_ = other.num_layer_; - tensor_para_size_ = other.tensor_para_size_; - tensor_para_rank_ = other.tensor_para_rank_; - layer_para_size_ = other.layer_para_size_; - layer_para_rank_ = other.layer_para_rank_; - prompt_token_weight_size_ = other.prompt_token_weight_size_; + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + vocab_size_ = other.vocab_size_; + num_layer_ = other.num_layer_; + layer_para_size_ = other.layer_para_size_; + layer_para_rank_ = other.layer_para_rank_; + prompt_token_weight_size_ = other.prompt_token_weight_size_; mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + //cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); @@ -132,7 +123,8 @@ template void LLaMAWeight::setWeightPtr() { pre_decoder_embedding_table = weights_ptr[0]; - post_decoder_layernorm.beta = weights_ptr[1]; + //post_decoder_layernorm.beta = weights_ptr[1]; + post_decoder_layernorm.beta = nullptr; post_decoder_layernorm.gamma = weights_ptr[2]; post_decoder_embedding.kernel = weights_ptr[3]; } @@ -143,7 +135,7 @@ void LLaMAWeight::mallocWeights() weights_ptr.resize(num_base_weights); deviceMalloc(&weights_ptr[0], vocab_size_ * hidden_units_); - deviceMalloc(&weights_ptr[1], hidden_units_); + //deviceMalloc(&weights_ptr[1], hidden_units_); deviceMalloc(&weights_ptr[2], hidden_units_); deviceMalloc(&weights_ptr[3], hidden_units_ * vocab_size_); @@ -156,16 +148,15 @@ void LLaMAWeight::loadModel(std::string dir_path) FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "llama"); FT_CHECK(is_maintain_buffer == true); - loadWeightFromBin( - weights_ptr[0], {(size_t)(vocab_size_ * hidden_units_)}, dir_path + "/model.wte.bin", model_file_type); - loadWeightFromBin( - weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.bias.bin", model_file_type); - - loadWeightFromBin( - weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.weight.bin", model_file_type); + loadWeightFromBin(weights_ptr[0], + {(size_t)(vocab_size_ * hidden_units_)}, + dir_path + "/model.tok_embeddings.weight.bin", + model_file_type); + //loadWeightFromBin(weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.norm.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.norm.weight.bin", model_file_type); loadWeightFromBin(weights_ptr[3], 
{(size_t)(vocab_size_ * hidden_units_)}, - dir_path + "/model.lm_head.weight.bin", + dir_path + "/model.output.weight.bin", model_file_type); for (int l = 0; l < num_layer_; l++) { diff --git a/src/fastertransformer/models/llama/LLaMAWeight.h b/src/fastertransformer/models/llama/LLaMAWeight.h index b372139e2..e1fed4309 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.h +++ b/src/fastertransformer/models/llama/LLaMAWeight.h @@ -32,8 +32,6 @@ struct LLaMAWeight { const int inter_size, const int vocab_size, const int num_layer, - const int tensor_para_size = 1, - const int tensor_para_rank = 0, const int layer_para_size = 1, const int layer_para_rank = 0); @@ -62,8 +60,6 @@ struct LLaMAWeight { int vocab_size_; int num_layer_; - int tensor_para_size_; - int tensor_para_rank_; int layer_para_size_; int layer_para_rank_; From dbe0657d5f504418261a0e84759dc95e98d45262 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 15 Sep 2023 08:55:49 +0000 Subject: [PATCH 09/55] remove debug code and bug fix --- src/fastertransformer/models/llama/LLaMA.cc | 7 ------- src/fastertransformer/models/llama/LLaMAContextDecoder.cc | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 9fcab580b..d03183a0a 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -402,11 +402,9 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); - std::cout << __FILE__ << ":" << __LINE__ << "\n"; // handle first step if (max_input_length > 1) { - std::cout << __FILE__ << ":" << __LINE__ << "\n"; invokeTileGptInputs(tiled_input_ids_buf_, tiled_input_lengths_buf_, input_tensors->at("input_ids").getPtr(), @@ -464,11 +462,9 @@ void LLaMA::forward(std::unordered_map* output_ten {"last_token_hidden_units", Tensor{MEMORY_GPU, data_type, {batch_size * beam_width, hidden_units_}, decoder_output_buf_}}}; - std::cout << __FILE__ << ":" << __LINE__ << "\n"; llama_context_decoder_->forward( &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - std::cout << __FILE__ << ":" << __LINE__ << "\n"; invokeDecodingInitialize(finished_buf_, sequence_lengths_, nullptr, @@ -526,7 +522,6 @@ void LLaMA::forward(std::unordered_map* output_ten cudaMemcpyDeviceToDevice, stream_); } - std::cout << __FILE__ << ":" << __LINE__ << "\n"; invokeMaskPaddingTokens(masked_tokens_, input_tensors->at("input_lengths").getPtr(), // not_tiled @@ -539,7 +534,6 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); for (int step = max_input_length; step < (int)max_output_seq_len; step++) { - std::cout << __FILE__ << ":" << __LINE__ << "\n"; const int src_indir_idx = (step - max_input_length) % 2; const int tgt_indir_idx = 1 - src_indir_idx; @@ -764,7 +758,6 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); } } - std::cout << __FILE__ << ":" << __LINE__ << "\n"; setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index e8f4a4e21..781338253 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -42,7 +42,7 @@ void LLaMAContextDecoder::initialize() 0); ffn_layer_ = new GeluFfnLayer(0, // 
max_batch_size - 1, + 0, head_num_, size_per_head_, 0, // expert_num From 29c7b690d7973865036593b14aa49b9716d1b711 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 15 Sep 2023 16:24:35 +0000 Subject: [PATCH 10/55] only contextdecoder is necessary --- .../layers/attention_layers/CMakeLists.txt | 5 + .../LLaMAContextAttentionLayer.cc | 622 ++++++++++++++++++ .../LLaMAContextAttentionLayer.h | 131 ++++ .../models/llama/CMakeLists.txt | 20 +- src/fastertransformer/models/llama/LLaMA.cc | 262 +------- src/fastertransformer/models/llama/LLaMA.h | 4 - .../models/llama/LLaMAContextDecoder.cc | 30 +- .../models/llama/LLaMADecoder.cc | 326 --------- .../models/llama/LLaMADecoder.h | 100 --- .../models/llama/LLaMADecoderLayerWeight.cc | 93 ++- .../models/llama/LLaMAWeight.cc | 10 +- 11 files changed, 824 insertions(+), 779 deletions(-) create mode 100644 src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc create mode 100644 src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h delete mode 100644 src/fastertransformer/models/llama/LLaMADecoder.cc delete mode 100644 src/fastertransformer/models/llama/LLaMADecoder.h diff --git a/src/fastertransformer/layers/attention_layers/CMakeLists.txt b/src/fastertransformer/layers/attention_layers/CMakeLists.txt index 1f0e93b1b..13821892d 100644 --- a/src/fastertransformer/layers/attention_layers/CMakeLists.txt +++ b/src/fastertransformer/layers/attention_layers/CMakeLists.txt @@ -44,6 +44,11 @@ set_property(TARGET GptContextAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE set_property(TARGET GptContextAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels trt_fused_multi_head_attention fpA_intB_gemm int8_gemm nvtx_utils) +add_library(LLaMAContextAttentionLayer STATIC LLaMAContextAttentionLayer.cc) +set_property(TARGET LLaMAContextAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMAContextAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMAContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels trt_fused_multi_head_attention fpA_intB_gemm int8_gemm nvtx_utils) + add_library(DisentangledAttentionLayer STATIC DisentangledAttentionLayer.cc) set_property(TARGET DisentangledAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET DisentangledAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc new file mode 100644 index 000000000..38ec79b47 --- /dev/null +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -0,0 +1,622 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" +#include "src/fastertransformer/kernels/unfused_attention_kernels.h" +#include "src/fastertransformer/utils/nvtx_utils.h" + +namespace fastertransformer { + +template +void LLaMAContextAttentionLayer::forward(TensorMap* output_tensors, + TensorMap* input_tensors, + const AttentionWeight* attention_weights) +{ + // input_tensors: + // input_query [token_num, hidden_dimension] + // attention_mask [batch_size, 1, seq_len, seq_len + max_prompt_length] + // attention_type [1] + // is_final_layer [1], bool on cpu + // layer_id [1], int on cpu + // padding_offset, int, [token_num] (optional) + // cu_seqlens, int, [batch_size] (optional) + // d_prefix_prompt_batch [global_batch_size], (optional) + // each element contains ptr with buffer shape[2, local_head_num_, prompt_length, size_per_head] + // d_prefix_prompt_lengths [batch_size], int (optional) + // linear_bias_slopes [head_num] (optional) + + // output_tensors: + // hidden_features [token_num, hidden_dimension] + // key_cache [batch, local_head_num, size_per_head // x, max_seq_len, x] + // value_cache [batch, local_head_num, max_seq_len, size_per_head] + FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + FT_CHECK(output_tensors->at("key_cache").shape.size() == 5); + FT_CHECK(output_tensors->at("value_cache").shape.size() == 4 + || output_tensors->at("value_cache").shape.size() == 3); + const int request_batch_size = input_tensors->at("attention_mask").shape[0]; + const int request_seq_len = input_tensors->at("attention_mask").shape[2]; + const int max_prompt_length = + input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; + const int layer_id = input_tensors->getVal("layer_id"); + const T** d_prefix_prompt_batch = input_tensors->getPtr("d_prefix_prompt_batch", nullptr); + const int* d_prefix_prompt_lengths = input_tensors->getPtr("d_prefix_prompt_lengths", nullptr); + const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); + int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + T* linear_bias_slopes = input_tensors->getPtr("linear_bias_slopes", nullptr); + /* float* attention_query_dynamic_scale = input_tensors->getPtr("attention_query_dynamic_scale", + * nullptr); */ + + T* attention_out = output_tensors->at("hidden_features").getPtr(); + T* attention_input = input_tensors->at("input_query").getPtr(); + T* attention_mask = input_tensors->at("attention_mask").getPtr(); + + const AttentionType attention_type = input_tensors->getVal("attention_type"); + FT_CHECK_WITH_INFO(attention_type != AttentionType::FUSED_PADDED_MHA, + "LLaMA Context FUSED_PADDED_MHA is not supported !"); + + PUSH_RANGE("attention buffer alloc"); + allocateBuffer(request_batch_size, request_seq_len + max_prompt_length, attention_type != AttentionType::FUSED_MHA); + POP_RANGE; + sync_check_cuda_error(); + + const bool is_final = input_tensors->at("is_final_layer").getVal(); + + const int m = input_tensors->at("input_query").shape[0]; + + PUSH_RANGE("qkv_gemm"); + +#ifdef SPARSITY_ENABLED + const int m_padded = 8 * div_up(m, 8); + bool use_sparse = sparse_ && cublas_wrapper_->isUseSparse(1, 3 * local_hidden_units_, m_padded, hidden_units_); +#else + constexpr bool use_sparse = false; +#endif + + if (use_sparse) { +#ifdef SPARSITY_ENABLED + cublas_wrapper_->SpGemm(CUBLAS_OP_N, + CUBLAS_OP_N, + 3 * 
local_hidden_units_, + m_padded, + hidden_units_, + attention_weights->query_weight.sp_kernel, + attention_input, + qkv_buf_); +#endif + } + else if (int8_mode_ == 1) { + FT_CHECK(weight_only_int8_fc_runner_.get() != NULL && attention_weights->query_weight.int8_kernel != NULL + && attention_weights->query_weight.weight_only_quant_scale != NULL); + + weight_only_int8_fc_runner_->gemm(attention_input, + reinterpret_cast(attention_weights->query_weight.int8_kernel), + attention_weights->query_weight.weight_only_quant_scale, + qkv_buf_, + m, + 3 * local_hidden_units_, + hidden_units_, + mixed_gemm_workspace_, + mixed_gemm_ws_bytes_, + stream_); + } + else if (int8_mode_ == 2) { + cublas_wrapper_->Int8Gemm(3 * local_hidden_units_, + m, + hidden_units_, + attention_weights->query_weight.int8_kernel, + hidden_units_, + input_tensors->at("input_query").getPtr(), + hidden_units_, + reinterpret_cast(qkv_buf_), + 3 * local_hidden_units_, + attention_weights->query_weight.scale_inter, + true); + } + else { + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + 3 * local_hidden_units_, // n + m, + hidden_units_, // k + attention_weights->query_weight.kernel, + 3 * local_hidden_units_, // n + attention_input, + hidden_units_, // k + qkv_buf_, + 3 * local_hidden_units_ /* n */); + } + + sync_check_cuda_error(); + + // IDEA: append prefix prompt key value here + PrefixPromptBatchWeightsParam param{d_prefix_prompt_batch, + d_prefix_prompt_lengths, + max_prompt_length, + (size_t)layer_id * 2 * local_head_num_ * size_per_head_}; + + if (padding_offset != nullptr) { + // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous + cudaMemsetAsync( + q_buf_2_, 0, request_batch_size * request_seq_len * 3 * local_hidden_units_ * sizeof(T), stream_); + } + invokeAddFusedQKVBiasTranspose(q_buf_2_, + k_buf_2_, + v_buf_2_, + param, // prefix prompt + qkv_buf_, + attention_weights->query_weight.bias, + padding_offset, + request_batch_size, + request_seq_len, + m, + local_head_num_, + size_per_head_, + rotary_embedding_dim_, + neox_rotary_style_, + attention_weights->query_weight.scale_out, + int8_mode_, + stream_); + sync_check_cuda_error(); + + const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length + // Use batch major + // put k/v_buf from shape [B, H, PL + L, Dh] + // to cache [B, H, Dh/x, PL + L, x] and [B, H, PL + L, Dh/x, x], PL denotes prompt length + invokeTranspose4dBatchMajor(output_tensors->getPtr("key_cache"), + output_tensors->getPtr("value_cache"), + k_buf_2_, + v_buf_2_, + request_batch_size, + max_prompt_length + request_seq_len, // max input length + prefix prompt length + max_seq_len, + size_per_head_, + local_head_num_, + stream_); + // IDEA : after this, k_cache = (batch_size, num_heads, Dh/x, prefix_prompt_len + L, x) + // k_cache = (batch_size, num_heads, prefix_prompt_len + L, Dh) + sync_check_cuda_error(); + + // TODO: fmha kernels doesn't support different seq lengths of q and kv + if (attention_type == AttentionType::FUSED_MHA) { + dispatcher_fp16->setup_causal_masked_fmha(request_seq_len, request_batch_size); + dispatcher_fp16->run_causal_masked_fmha(qkv_buf_, cu_seqlens, qkv_buf_3_, true, stream_); + } + // NOTE: qkv buffer shape (batch_size, num_heads,L or prompt_len + L, Dh) + + POP_RANGE; + if (is_final == false) { + const cudaDataType_t gemm_data_type = getCudaDataType(); + const int attention_seq_len_1 = request_seq_len; // q length + const int attention_seq_len_2 = max_prompt_length + request_seq_len; // kv length + const T qk_scale = static_cast(1.0f / 
sqrtf(size_per_head_ * 1.0f)); + if (attention_type != AttentionType::FUSED_MHA) { + if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { + PUSH_RANGE("Q*K batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, + CUBLAS_OP_N, + attention_seq_len_2, // n + attention_seq_len_1, // m + size_per_head_, // k + 1.0f, + k_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_2 * size_per_head_, // n * k + q_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_1 * size_per_head_, // m * k + 0.0f, + qk_buf_float_, + CUDA_R_32F, + attention_seq_len_2, // n + attention_seq_len_2 * attention_seq_len_1, + request_batch_size * local_head_num_, // global batch size + CUDA_R_32F); + + sync_check_cuda_error(); + POP_RANGE; + + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = request_batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = local_head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = const_cast(linear_bias_slopes); // (head_num,), optional + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; + } + else { + PUSH_RANGE("Q*K batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, + CUBLAS_OP_N, + attention_seq_len_2, + attention_seq_len_1, + size_per_head_, + k_buf_2_, + size_per_head_, + attention_seq_len_2 * size_per_head_, + q_buf_2_, + size_per_head_, + attention_seq_len_1 * size_per_head_, + qk_buf_, + attention_seq_len_2, + attention_seq_len_2 * attention_seq_len_1, + request_batch_size * local_head_num_); + + POP_RANGE; + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = request_batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = local_head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = const_cast(linear_bias_slopes); // (head_num,), optional + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; + } + + PUSH_RANGE("QK*V batch gemm"); + + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, + CUBLAS_OP_N, + size_per_head_, + attention_seq_len_1, + attention_seq_len_2, + v_buf_2_, + size_per_head_, + attention_seq_len_2 * size_per_head_, + qk_buf_, + attention_seq_len_2, + attention_seq_len_1 * attention_seq_len_2, + qkv_buf_2_, + size_per_head_, + attention_seq_len_1 * size_per_head_, + request_batch_size * local_head_num_); + + // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) + if (padding_offset == nullptr) { + invokeTransposeQKV(qkv_buf_3_, + qkv_buf_2_, + request_batch_size, + attention_seq_len_1, + local_head_num_, + size_per_head_, + attention_weights->attention_output_weight.scale, + int8_mode_, + stream_); + sync_check_cuda_error(); + } + else { + invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, + qkv_buf_3_, + m, + request_batch_size, + attention_seq_len_1, + local_head_num_, + size_per_head_, + padding_offset, + attention_weights->attention_output_weight.scale, + int8_mode_, + stream_); + } + POP_RANGE; 
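// ---------------------------------------------------------------------------
// Editor's note (illustrative addition, not part of this patch): the unfused
// path above computes, per head, softmax(Q K^T * qk_scale + mask) V using two
// strided batched GEMMs around a masked-softmax kernel, where
// qk_scale = 1 / sqrt(size_per_head). The single-head CPU reference below is
// a sketch of that math only -- single_head_attention is a hypothetical
// helper with no batching, KV cache, prefix prompts, or int8 paths, and it is
// not FasterTransformer code.
// ---------------------------------------------------------------------------
#include <algorithm>
#include <cmath>
#include <vector>

// q, k, v: [seq_len x size_per_head], row-major. mask: [seq_len x seq_len],
// 0 for visible positions and a large negative value for masked ones.
std::vector<float> single_head_attention(const std::vector<float>& q,
                                         const std::vector<float>& k,
                                         const std::vector<float>& v,
                                         const std::vector<float>& mask,
                                         int seq_len,
                                         int size_per_head)
{
    const float qk_scale = 1.0f / std::sqrt(static_cast<float>(size_per_head));
    std::vector<float> out(seq_len * size_per_head, 0.0f);
    std::vector<float> scores(seq_len);

    for (int i = 0; i < seq_len; i++) {
        // logits_ij = (q_i . k_j) * qk_scale + mask_ij
        float max_logit = -1e30f;
        for (int j = 0; j < seq_len; j++) {
            float dot = 0.0f;
            for (int d = 0; d < size_per_head; d++) {
                dot += q[i * size_per_head + d] * k[j * size_per_head + d];
            }
            scores[j] = dot * qk_scale + mask[i * seq_len + j];
            max_logit = std::max(max_logit, scores[j]);
        }
        // Softmax over j; the max logit is subtracted first for numerical stability.
        float denom = 0.0f;
        for (int j = 0; j < seq_len; j++) {
            scores[j] = std::exp(scores[j] - max_logit);
            denom += scores[j];
        }
        // out_i = sum_j softmax_ij * v_j
        for (int j = 0; j < seq_len; j++) {
            for (int d = 0; d < size_per_head; d++) {
                out[i * size_per_head + d] += (scores[j] / denom) * v[j * size_per_head + d];
            }
        }
    }
    return out;
}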
+ } + sync_check_cuda_error(); + + PUSH_RANGE("proj gemm"); +#ifdef SPARSITY_ENABLED + bool use_sparse = sparse_ && cublas_wrapper_->isUseSparse(1, hidden_units_, m_padded, local_hidden_units_); +#endif + + if (use_sparse) { +#ifdef SPARSITY_ENABLED + cublas_wrapper_->SpGemm(CUBLAS_OP_N, + CUBLAS_OP_N, + hidden_units_, + m_padded, + local_hidden_units_, + attention_weights->attention_output_weight.sp_kernel, + qkv_buf_3_, + attention_out); +#endif + } + else { + if (int8_mode_ == 1) { + FT_CHECK(weight_only_int8_fc_runner_.get() != NULL + && attention_weights->attention_output_weight.int8_kernel != NULL + && attention_weights->attention_output_weight.weight_only_quant_scale != NULL); + + weight_only_int8_fc_runner_->gemm( + qkv_buf_3_, + reinterpret_cast(attention_weights->attention_output_weight.int8_kernel), + attention_weights->attention_output_weight.weight_only_quant_scale, + attention_out, + m, + hidden_units_, + local_hidden_units_, + mixed_gemm_workspace_, + mixed_gemm_ws_bytes_, + stream_); + } + else if (int8_mode_ == 2) { + int8_fc_runner_->gemm(reinterpret_cast(qkv_buf_3_), + attention_weights->attention_output_weight.int8_kernel, + QuantMode::PerTensorQuant, + attention_weights->attention_output_weight.scale_inter, + attention_weights->attention_output_weight.scale_out, + output_tensors->at("hidden_features").getPtr(), + m, + hidden_units_, + local_hidden_units_, + nullptr, + 0, + stream_); + } + else { + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + hidden_units_, + m, + local_hidden_units_, + attention_weights->attention_output_weight.kernel, + hidden_units_, + qkv_buf_3_, + local_hidden_units_, + attention_out, + hidden_units_); + } + } + POP_RANGE; + } + + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } + sync_check_cuda_error(); + FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + +template +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_batch_size, + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse, + int int8_mode): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), + max_batch_size_(max_batch_size), + max_seq_len_(max_seq_len), + head_num_(head_num), + size_per_head_(size_per_head), + hidden_units_(head_num * size_per_head), + local_head_num_(head_num), + local_hidden_units_(local_head_num_ * size_per_head), + rotary_embedding_dim_(0), + neox_rotary_style_(false), + is_qk_buf_float_(is_qk_buf_float || int8_mode == 2), + weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), + int8_fc_runner_(int8_mode == 2 ? 
std::make_shared>() : nullptr), + int8_mode_(int8_mode) +{ +} + +template +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_batch_size, + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + size_t local_head_num, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse, + int int8_mode): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), + max_batch_size_(max_batch_size), + max_seq_len_(max_seq_len), + head_num_(head_num), + size_per_head_(size_per_head), + hidden_units_(head_num * size_per_head), + local_head_num_(local_head_num), + local_hidden_units_(local_head_num_ * size_per_head), + rotary_embedding_dim_(0), + neox_rotary_style_(false), + is_qk_buf_float_(is_qk_buf_float || int8_mode == 2), + weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), + int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr), + int8_mode_(int8_mode) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + dispatcher_fp16.reset(new FusedMHARunnerFP16v2(local_head_num_, size_per_head_, sm_, 1.0f)); +} + +template +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_batch_size, + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + size_t local_head_num, + size_t rotary_embedding_dim, + bool neox_rotary_style, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse, + int int8_mode): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), + max_batch_size_(max_batch_size), + max_seq_len_(max_seq_len), + head_num_(head_num), + size_per_head_(size_per_head), + hidden_units_(head_num * size_per_head), + local_head_num_(local_head_num), + local_hidden_units_(local_head_num_ * size_per_head), + rotary_embedding_dim_(rotary_embedding_dim), + neox_rotary_style_(neox_rotary_style), + is_qk_buf_float_(is_qk_buf_float), + weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), + int8_fc_runner_(int8_mode == 2 ? 
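// NOTE (illustrative): the rotary_embedding_dim_ / neox_rotary_style_ members set by these
// constructors control how rotary position embedding (RoPE) is applied to Q and K before the
// batched GEMMs. The sketch below is the interleaved (GPT-J style) variant for one vector; the
// NeoX style instead pairs element i with element i + rot_dim / 2. Base 10000 is the conventional
// default and an assumption here (needs <cmath>); the production path does this in a fused kernel.
#if 0  // illustrative sketch
void apply_rope_interleaved(float* x, int rot_dim, int pos, float base = 10000.f)
{
    for (int i = 0; i < rot_dim; i += 2) {
        const float theta = pos * std::pow(base, -static_cast<float>(i) / rot_dim);
        const float c  = std::cos(theta);
        const float s  = std::sin(theta);
        const float x0 = x[i];
        const float x1 = x[i + 1];
        x[i]     = x0 * c - x1 * s;  // rotate the (x[i], x[i+1]) pair by theta
        x[i + 1] = x0 * s + x1 * c;
    }
}
#endif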
std::make_shared>() : nullptr), + int8_mode_(int8_mode) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + dispatcher_fp16.reset(new FusedMHARunnerFP16v2(local_head_num_, size_per_head_, sm_, 1.0f)); +} + +template +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer): + BaseAttentionLayer(attention_layer.stream_, + attention_layer.cublas_wrapper_, + attention_layer.allocator_, + attention_layer.is_free_buffer_after_forward_, + attention_layer.sparse_), + max_batch_size_(attention_layer.max_batch_size_), + max_seq_len_(attention_layer.max_seq_len_), + head_num_(attention_layer.head_num_), + size_per_head_(attention_layer.size_per_head_), + hidden_units_(attention_layer.hidden_units_), + local_head_num_(attention_layer.local_head_num_), + local_hidden_units_(attention_layer.local_hidden_units_), + rotary_embedding_dim_(attention_layer.rotary_embedding_dim_), + neox_rotary_style_(attention_layer.neox_rotary_style_), + is_qk_buf_float_(attention_layer.is_qk_buf_float_), + weight_only_int8_fc_runner_(attention_layer.weight_only_int8_fc_runner_), + int8_fc_runner_(attention_layer.int8_fc_runner_), + int8_mode_(attention_layer.int8_mode_) +{ +} + +template +LLaMAContextAttentionLayer::~LLaMAContextAttentionLayer() +{ + cublas_wrapper_ = nullptr; + freeBuffer(); +} + +template +void LLaMAContextAttentionLayer::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + // const auto type_size = int8_mode_ == 2 ? sizeof(int8_t) : sizeof(T); + // NOTE (perkzz): use sizeof(T) here for cutlass int8 kernels. + const auto type_size = sizeof(T); + qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, type_size * 3 * batch_size * seq_len * local_hidden_units_, true); + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * local_hidden_units_, true); + k_buf_2_ = q_buf_2_ + batch_size * seq_len * local_hidden_units_; + v_buf_2_ = k_buf_2_ + batch_size * seq_len * local_hidden_units_; + + // save memory usage when using fmha + if (allocate_qk_buf) { + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * local_head_num_ * seq_len * seq_len, true); + } + else { + allocator_->free((void**)(&qk_buf_)); + } + qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * local_hidden_units_, true); + qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, type_size * batch_size * seq_len * local_hidden_units_, true); + + if (is_qk_buf_float_ == true) { + if (allocate_qk_buf) { + qk_buf_float_ = (float*)allocator_->reMalloc( + qk_buf_float_, sizeof(float) * batch_size * local_head_num_ * seq_len * seq_len, true); + } + else { + allocator_->free((void**)(&qk_buf_float_)); + } + } + + if (int8_mode_ == 1) { + // We use max_size for n and k since we reuse buffers for both FCs and want to allocate the max + // possible memory that would be required by any of the individual gemms. + const int max_size = std::max(hidden_units_, 3 * local_hidden_units_); + mixed_gemm_ws_bytes_ = weight_only_int8_fc_runner_->getWorkspaceSize(batch_size * seq_len, max_size, max_size); + mixed_gemm_workspace_ = (char*)allocator_->reMalloc(mixed_gemm_workspace_, mixed_gemm_ws_bytes_, false); + } + + if (int8_mode_ == 1) { + // We use max_size for n and k since we reuse buffers for both FCs and want to allocate the max + // possible memory that would be required by any of the individual gemms. 
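// NOTE (illustrative): worked sizing example for the buffers allocated above, using the llama_33B
// geometry that appears elsewhere in this patch (head_num = 52, size_per_head = 128, so
// local_hidden_units_ = 6656 with no tensor parallelism), FP16 storage, request_batch_size = 4 and
// an assumed seq_len of 512. The numbers show why allocate_qk_buf is skipped on the fused-MHA path.
#if 0  // illustrative sketch
constexpr size_t kBatch = 4, kSeqLen = 512, kHeads = 52, kSizePerHead = 128;
constexpr size_t kHidden = kHeads * kSizePerHead;  // 6656
constexpr size_t kHalf   = 2;                      // sizeof(half)
constexpr size_t kQkvBufBytes     = kHalf * 3 * kBatch * kSeqLen * kHidden;               // ~81.8 MB
constexpr size_t kQkBufBytes      = kHalf * kBatch * kHeads * kSeqLen * kSeqLen;          // ~109.1 MB
constexpr size_t kQkBufFloatBytes = sizeof(float) * kBatch * kHeads * kSeqLen * kSeqLen;  // ~218.1 MB
static_assert(kQkBufBytes == 109051904, "2 B * 4 * 52 * 512 * 512");
#endif
// The max_size workspace below applies the same "size for the largest GEMM" idea to the INT8 runners.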
+ const int max_size = std::max(hidden_units_, 3 * local_hidden_units_); + mixed_gemm_ws_bytes_ = weight_only_int8_fc_runner_->getWorkspaceSize(batch_size * seq_len, max_size, max_size); + mixed_gemm_workspace_ = (char*)allocator_->reMalloc(mixed_gemm_workspace_, mixed_gemm_ws_bytes_, false); + } + else if (int8_mode_ == 2) { + const int max_size = std::max(hidden_units_, 3 * local_hidden_units_); + int8_gemm_ws_bytes_ = int8_fc_runner_->getWorkspaceSize(batch_size * seq_len, max_size, max_size); + int8_gemm_workspace_ = (char*)allocator_->reMalloc(int8_gemm_workspace_, int8_gemm_ws_bytes_, false); + } + is_allocate_buffer_ = true; +} + +template +void LLaMAContextAttentionLayer::freeBuffer() +{ + if (is_allocate_buffer_) { + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + allocator_->free((void**)(&qkv_buf_)); + allocator_->free((void**)(&q_buf_2_)); + allocator_->free((void**)(&qk_buf_)); + allocator_->free((void**)(&qkv_buf_2_)); + allocator_->free((void**)(&qkv_buf_3_)); + + if (is_qk_buf_float_ == true) { + allocator_->free((void**)(&qk_buf_float_)); + } + + allocator_->free((void**)(&mixed_gemm_workspace_)); + mixed_gemm_ws_bytes_ = 0; + + allocator_->free((void**)(&int8_gemm_workspace_)); + int8_gemm_ws_bytes_ = 0; + + is_allocate_buffer_ = false; + } +} + +template class LLaMAContextAttentionLayer; +template class LLaMAContextAttentionLayer; +#ifdef ENABLE_BF16 +template class LLaMAContextAttentionLayer<__nv_bfloat16>; +#endif + +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h new file mode 100644 index 000000000..6a18d734e --- /dev/null +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include "3rdparty/trt_fused_multihead_attention/qkvToContext.h"
+#include "src/fastertransformer/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
+#include "src/fastertransformer/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
+#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h"
+
+namespace fastertransformer {
+
+template<typename T>
+class LLaMAContextAttentionLayer: public BaseAttentionLayer<T> {
+private:
+    // buffer handling
+    size_t max_batch_size_ = 0;
+    size_t max_seq_len_    = 0;
+
+    // metadata
+    const size_t head_num_;
+    const size_t size_per_head_;
+    const size_t hidden_units_;
+    const size_t local_head_num_;
+    const size_t local_hidden_units_;
+    const size_t rotary_embedding_dim_;
+    const bool   neox_rotary_style_;
+
+    // fmha runner
+    int                        sm_ = getSMVersion();
+    std::unique_ptr<MHARunner> dispatcher_fp16;
+
+    void allocateBuffer() override;
+    void allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf);
+    void freeBuffer() override;
+
+    using BaseAttentionLayer<T>::is_free_buffer_after_forward_;
+    using BaseAttentionLayer<T>::is_allocate_buffer_;
+    using BaseAttentionLayer<T>::cublas_wrapper_;
+
+    bool is_qk_buf_float_;
+
+    std::shared_ptr<CutlassFpAIntBGemmRunner<T, uint8_t>> weight_only_int8_fc_runner_;
+    std::shared_ptr<CutlassInt8GemmRunner<T>>             int8_fc_runner_;
+
+protected:
+    using BaseAttentionLayer<T>::allocator_;
+    using BaseAttentionLayer<T>::stream_;
+    using BaseAttentionLayer<T>::sparse_;
+    T*     qkv_buf_              = nullptr;
+    T*     q_buf_2_              = nullptr;
+    T*     k_buf_2_              = nullptr;
+    T*     v_buf_2_              = nullptr;
+    T*     qk_buf_               = nullptr;
+    float* qk_buf_float_         = nullptr;
+    T*     qkv_buf_2_            = nullptr;
+    T*     qkv_buf_3_            = nullptr;
+    char*  mixed_gemm_workspace_ = nullptr;
+    size_t mixed_gemm_ws_bytes_  = 0;
+    char*  int8_gemm_workspace_  = nullptr;
+    size_t int8_gemm_ws_bytes_   = 0;
+
+    // int8_mode_ == 0 means we don't use any mechanism related to INT8.
+ // int8_mode_ == 1 for weight quantized only gemm for GPT + // int8_mode_ == 2 for SmoothQuant O3 (per tensor scales) + const int int8_mode_ = 0; + +public: + LLaMAContextAttentionLayer(size_t max_batch_size, + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse = false, + int int8_mode = 0); + + LLaMAContextAttentionLayer(size_t max_batch_size, + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + size_t local_head_num, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse = false, + int int8_mode = 0); + + LLaMAContextAttentionLayer(size_t max_batch_size, + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + size_t local_head_num, + size_t rotary_embedding_dim, + bool neox_rotary_style_, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse = false, + int int8_mode = 0); + + LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer); + + virtual ~LLaMAContextAttentionLayer(); + + void + forward(TensorMap* output_tensors, TensorMap* input_tensors, const AttentionWeight* attention_weights) override; +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt index da314ec7d..0c5106f00 100644 --- a/src/fastertransformer/models/llama/CMakeLists.txt +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -19,26 +19,12 @@ set_property(TARGET LLaMADecoderLayerWeight PROPERTY POSITION_INDEPENDENT_CODE set_property(TARGET LLaMADecoderLayerWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(LLaMADecoderLayerWeight PUBLIC memory_utils cuda_utils logger) -add_library(LLaMADecoder STATIC LLaMADecoder.cc) -set_property(TARGET LLaMADecoder PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET LLaMADecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(LLaMADecoder PUBLIC -lcudart cublasMMWrapper - TensorParallelDecoderSelfAttentionLayer - TensorParallelGeluFfnLayer - layernorm_kernels - add_residual_kernels - LLaMADecoderLayerWeight - tensor - nccl_utils - cuda_utils - logger) - add_library(LLaMAContextDecoder STATIC LLaMAContextDecoder.cc) set_property(TARGET LLaMAContextDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET LLaMAContextDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(LLaMAContextDecoder PUBLIC -lcudart cublasMMWrapper - TensorParallelGptContextAttentionLayer - TensorParallelGeluFfnLayer + LLaMAContextAttentionLayer + FfnLayer layernorm_kernels add_residual_kernels gpt_kernels @@ -56,11 +42,9 @@ add_library(LLaMA STATIC LLaMA.cc) set_property(TARGET LLaMA PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET LLaMA PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(LLaMA PUBLIC -lcudart - LLaMADecoder LLaMAContextDecoder decoding_kernels gpt_kernels - DynamicDecodeLayer BaseBeamSearchLayer bert_preprocess_kernels tensor diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index d03183a0a..fb8eb4f9c 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -42,30 +42,6 @@ void 
LLaMA::initialize() attention_type_, custom_all_reduce_comm_, enable_custom_all_reduce_); - - llama_decoder_ = new LLaMADecoder(head_num_, - size_per_head_, - inter_size_, - num_layer_, - rotary_embedding_dim_, - neox_rotary_style_, - layernorm_eps_, - pipeline_para_, - stream_, - cublas_wrapper_, - allocator_, - is_free_buffer_after_forward_, - custom_all_reduce_comm_, - enable_custom_all_reduce_); - - dynamic_decode_layer_ = new DynamicDecodeLayer(vocab_size_, - vocab_size_, - 0, // end_id, deprecated - stream_, - cublas_wrapper_, - allocator_, - is_free_buffer_after_forward_, - cuda_device_prop_); } template @@ -273,8 +249,6 @@ LLaMA::LLaMA(LLaMA const& llama): template LLaMA::~LLaMA() { - delete llama_decoder_; - delete dynamic_decode_layer_; delete llama_context_decoder_; freeBuffer(); } @@ -373,15 +347,8 @@ void LLaMA::forward(std::unordered_map* output_ten setSeqLimitLen(seq_limit_len_, input_tensors->at("output_seq_len"), limit_len_offset, batch_size); sync_check_cuda_error(); - { - TensorMap input_map(*input_tensors); - dynamic_decode_layer_->setup(batch_size, beam_width, &input_map); - handleOptArg(&input_map, "start_id", start_ids_buf_, start_id_, batch_size); - handleOptArg(&input_map, "end_id", end_ids_buf_, end_id_, batch_size); - } - - const DataType data_type = getTensorType(); + const DataType data_type = getTensorType(); const std::vector self_k_cache_shape = {num_layer_ / pipeline_para_.world_size_, batch_size * beam_width, head_num_, @@ -402,7 +369,6 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); - // handle first step if (max_input_length > 1) { invokeTileGptInputs(tiled_input_ids_buf_, @@ -533,232 +499,6 @@ void LLaMA::forward(std::unordered_map* output_ten beam_width, stream_); - for (int step = max_input_length; step < (int)max_output_seq_len; step++) { - const int src_indir_idx = (step - max_input_length) % 2; - const int tgt_indir_idx = 1 - src_indir_idx; - - const size_t local_batch_size = getLocalBatchSize(batch_size, 1, pipeline_para_.world_size_); - FT_CHECK(batch_size % local_batch_size == 0); - const size_t iteration_num = batch_size / local_batch_size; - *generation_should_stop_ = true; - - for (uint ite = 0; ite < iteration_num; ++ite) { - const int id_offset = ite * local_batch_size * beam_width; - const int hidden_units_offset = id_offset * hidden_units_; - const int vocab_size_units_offset = id_offset * vocab_size_; - - if (!(max_input_length > 1 && step == max_input_length)) { - if (pipeline_para_.rank_ == 0) { - invokeEmbeddingLookupPosEncodingPadCount(decoder_input_buf_ + hidden_units_offset, - llama_weights->pre_decoder_embedding_table, - llama_weights->position_encoding_table, - output_ids_buf_ + id_offset, - tiled_total_padding_count_ + id_offset, - local_batch_size * beam_width, - hidden_units_, - (T)(1.0f), - step - 1, - batch_size * beam_width, - 0, - stream_); - sync_check_cuda_error(); - } - std::unordered_map decoder_input_tensors{ - {"decoder_input", - Tensor{MEMORY_GPU, - data_type, - {local_batch_size * beam_width, hidden_units_}, - decoder_input_buf_ + hidden_units_offset}}, - {"finished", - Tensor{MEMORY_GPU, TYPE_BOOL, {local_batch_size * beam_width}, finished_buf_ + id_offset}}, - {"sequence_lengths", - Tensor{MEMORY_GPU, TYPE_INT32, {local_batch_size * beam_width}, sequence_lengths_ + id_offset}}, - {"total_padding_tokens", - Tensor{MEMORY_GPU, - TYPE_INT32, - {local_batch_size * beam_width}, - tiled_total_padding_count_ + id_offset}}, - {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, 
&max_input_length}}, - {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, - {"ite", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &ite}}, - {"cache_indirection", - Tensor{MEMORY_GPU, - TYPE_INT32, - {local_batch_size, beam_width, max_output_seq_len}, - beam_width > 1 ? cache_indirections_[src_indir_idx] + id_offset * max_output_seq_len : - nullptr}}, - {"masked_tokens", - Tensor{MEMORY_GPU, - TYPE_BOOL, - {local_batch_size * beam_width, max_cache_seq_len}, - masked_tokens_ + id_offset * max_cache_seq_len}}}; - std::unordered_map decoder_output_tensors{ - {"decoder_output", - Tensor{MEMORY_GPU, - data_type, - {local_batch_size * beam_width, hidden_units_}, - decoder_output_buf_ + hidden_units_offset}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, - {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}}; - llama_decoder_->forward( - &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); - } - - if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { - invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_ + hidden_units_offset, - decoder_output_buf_ + hidden_units_offset, - llama_weights->post_decoder_layernorm.gamma, - llama_weights->post_decoder_layernorm.beta, - layernorm_eps_, - local_batch_size * beam_width, - hidden_units_, - stream_); - sync_check_cuda_error(); - - float alpha = 1.0f; - float beta = 0.0f; - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - vocab_size_, - local_batch_size * beam_width, - hidden_units_, // k - &alpha, - llama_weights->post_decoder_embedding.kernel, - gemm_data_type, - hidden_units_, // k - normed_decoder_output_buf_ + hidden_units_offset, - gemm_data_type, - hidden_units_, // k - &beta, - logits_buf_ + vocab_size_units_offset, - CUDA_R_32F, - vocab_size_, - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - - int tmp_local_batch_size = local_batch_size; - bool is_initialize_random_table = step == max_input_length; - std::unordered_map dynamic_decode_input_tensors{ - {"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size_}, logits_buf_}}, - // {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size_}, nullptr}}, - {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, - {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, - {"input_lengths", - Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, tiled_input_lengths_buf_}}, - {"sequence_limit_length", Tensor{MEMORY_GPU, TYPE_UINT32, {batch_size}, seq_limit_len_}}, - {"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}}, - {"src_cache_indirection", - Tensor{MEMORY_GPU, - TYPE_INT32, - {local_batch_size, beam_width, max_output_seq_len}, - cache_indirections_[src_indir_idx] + id_offset * max_output_seq_len}}, - {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &tmp_local_batch_size}}, - {"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, end_ids_buf_}}, - {"is_initialize_random_table", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &is_initialize_random_table}}}; - - for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) { - if (dynamic_decode_input_tensors.find(t->first) == dynamic_decode_input_tensors.end()) { - dynamic_decode_input_tensors.insert(*t); - } - } - - // common outputs - bool subbatch_should_stop = false; - std::unordered_map dynamic_decode_output_tensors{ - {"output_ids", - Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, output_ids_buf_}}, - {"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, 
finished_buf_}}, - // cum_log_probs is necessary for beam search, while it is optional for sampling. - {"cum_log_probs", - Tensor{MEMORY_GPU, - TYPE_FP32, - {batch_size * beam_width}, - ((beam_width > 1) || (output_tensors->count("cum_log_probs") > 0)) ? cum_log_probs_ : - nullptr}}, - {"output_log_probs", - Tensor{MEMORY_GPU, - TYPE_FP32, - {max_seq_len, batch_size, beam_width}, - output_tensors->count("output_log_probs") > 0 - && output_tensors->at("output_log_probs").data != nullptr ? - output_log_probs_buf_ : - nullptr}}, - {"parent_ids", - Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, parent_ids_buf_}}, - {"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, sequence_lengths_}}, - {"tgt_cache_indirection", - Tensor{MEMORY_GPU, - TYPE_INT32, - {local_batch_size, beam_width, max_output_seq_len}, - cache_indirections_[tgt_indir_idx] + id_offset * max_output_seq_len}}, - {"should_stop", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &subbatch_should_stop}}}; - - for (auto t = output_tensors->begin(); t != output_tensors->end(); ++t) { - // Handle exceptions. - if (t->first == "cum_log_probs" || t->first == "output_log_probs") { - continue; - } - dynamic_decode_output_tensors.insert(*t); - } - - dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors); - *generation_should_stop_ &= subbatch_should_stop; - } - } - - if (pipeline_para_.world_size_ > 1) { - ftNcclGroupStart(); - ftNcclBroadCast(output_ids_buf_ + step * batch_size * beam_width, - batch_size * beam_width, - pipeline_para_.world_size_ - 1, - pipeline_para_, - stream_); - - ftNcclBroadCast( - sequence_lengths_, batch_size * beam_width, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); - - ftNcclBroadCast(generation_should_stop_, 1, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); - - if (beam_width > 1) { - ftNcclBroadCast(cache_indirections_[tgt_indir_idx], - batch_size * beam_width * max_output_seq_len, - pipeline_para_.world_size_ - 1, - pipeline_para_, - stream_); - } - ftNcclGroupEnd(); - // throw errors when detected - NcclParam tensor_para(0, 1); - ftNcclStreamSynchronize(tensor_para, pipeline_para_, stream_); - sync_check_cuda_error(); - } - - if (*generation_should_stop_) { - break; - } - if (token_generated_cb_ && step + 1 < (int)max_output_seq_len) { - setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); - sendTensorsToFirstPipelineNode(output_tensors, input_tensors); - - if (pipeline_para_.rank_ == 0) { - token_generated_cb_(output_tensors, token_generated_ctx_); - } - } - if (step == max_input_length) { - /* We have just finished processing input: update the padding count: - * total_padding_count += (max_input_length - input_lengths) - */ - invokeUpdatePaddingCount(tiled_total_padding_count_, - input_tensors->at("input_lengths").getPtr(), // not_tiled - (const int*)nullptr, - max_input_length, - 0, - batch_size, - beam_width, - stream_); - } - } - setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 7a66a2ebf..303236b72 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -19,9 +19,7 @@ #include #include -#include "src/fastertransformer/layers/DynamicDecodeLayer.h" #include "src/fastertransformer/models/llama/LLaMAContextDecoder.h" 
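// NOTE (illustrative): with DynamicDecodeLayer dropped here, LLaMA appears to keep only the
// context (prompt) pass. The per-step loop deleted from LLaMA.cc above moved hidden states
// between adjacent pipeline ranks and let only the last rank produce logits; a schematic of that
// hand-off is sketched below using the NCCL helpers from the removed code. Buffer names are
// placeholders, not the real members.
#if 0  // illustrative sketch of the pipeline-parallel hand-off
if (pipeline_para_.rank_ != 0) {  // first locally owned layer: receive activations from the previous rank
    ftNcclRecv(layer_input, local_batch_size * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_);
}
// ... run the layers owned by this rank ...
if (pipeline_para_.rank_ != pipeline_para_.world_size_ - 1) {  // last locally owned layer: pass activations on
    ftNcclSend(layer_output, local_batch_size * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, stream_);
}
// per step, the last rank then broadcast output_ids / sequence_lengths / the stop flag to all ranks
#endif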
-#include "src/fastertransformer/models/llama/LLaMADecoder.h" #include "src/fastertransformer/models/llama/LLaMAWeight.h" #include "src/fastertransformer/utils/custom_ar_comm.h" @@ -56,9 +54,7 @@ class LLaMA: public BaseLayer { const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); - LLaMADecoder* llama_decoder_; LLaMAContextDecoder* llama_context_decoder_; - DynamicDecodeLayer* dynamic_decode_layer_; void allocateBuffer() override; void allocateBuffer( diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 781338253..119c98041 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -19,27 +19,27 @@ #include "src/fastertransformer/kernels/gpt_kernels.h" #include "src/fastertransformer/layers/FfnLayer.h" -#include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" +#include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" namespace fastertransformer { template void LLaMAContextDecoder::initialize() { - self_attention_layer_ = new GptContextAttentionLayer(0, // max_batch_size - 0, // max_seq_len - head_num_, - size_per_head_, - head_num_, - rotary_embedding_dim_, - neox_rotary_style_, - stream_, - cublas_wrapper_, - allocator_, - is_free_buffer_after_forward_, - is_qk_buf_float_, - false, - 0); + self_attention_layer_ = new LLaMAContextAttentionLayer(0, // max_batch_size + 0, // max_seq_len + head_num_, + size_per_head_, + head_num_, + rotary_embedding_dim_, + neox_rotary_style_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + is_qk_buf_float_, + false, + 0); ffn_layer_ = new GeluFfnLayer(0, // max_batch_size 0, diff --git a/src/fastertransformer/models/llama/LLaMADecoder.cc b/src/fastertransformer/models/llama/LLaMADecoder.cc deleted file mode 100644 index a98cd0159..000000000 --- a/src/fastertransformer/models/llama/LLaMADecoder.cc +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/fastertransformer/models/llama/LLaMADecoder.h" -#include "src/fastertransformer/layers/FfnLayer.h" -#include "src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h" - -namespace fastertransformer { - -template -void LLaMADecoder::initialize() -{ - self_attention_layer_ = new DecoderSelfAttentionLayer(0, // max_batch_size - head_num_, - size_per_head_, - head_num_, - rotary_embedding_dim_, - neox_rotary_style_, - head_num_ * size_per_head_, - 1.0f, - stream_, - cublas_wrapper_, - allocator_, - is_free_buffer_after_forward_, - false, - 0); - - ffn_layer_ = new GeluFfnLayer(0, // max_batch_size - 1, - head_num_, - size_per_head_, - 0, // expert_num - inter_size_, - stream_, - cublas_wrapper_, - allocator_, - is_free_buffer_after_forward_, - false, - 0, - false // use_gated_activation = false - ); -} - -template -void LLaMADecoder::allocateBuffer() -{ - FT_CHECK(false); -} - -template -void LLaMADecoder::allocateBuffer(size_t batch_size) -{ - decoder_normed_input_ = reinterpret_cast( - allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * hidden_units_, false)); - self_attn_output_ = - reinterpret_cast(allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * hidden_units_, false)); - ffn_output_ = - reinterpret_cast(allocator_->reMalloc(ffn_output_, sizeof(T) * batch_size * hidden_units_, false)); - decoder_layer_output_ = reinterpret_cast( - allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * hidden_units_, false)); - is_allocate_buffer_ = true; -} - -template -void LLaMADecoder::freeBuffer() -{ - if (is_allocate_buffer_ == true) { - allocator_->free((void**)(&decoder_normed_input_)); - allocator_->free((void**)(&self_attn_output_)); - allocator_->free((void**)(&ffn_output_)); - allocator_->free((void**)(&decoder_layer_output_)); - is_allocate_buffer_ = false; - } -} - -template -bool LLaMADecoder::isValidLayerParallelId(uint l) -{ - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) - && (l < local_num_layer * (pipeline_para_.rank_ + 1)); -} - -template -bool LLaMADecoder::isFirstLayerParallelId(uint l) -{ - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); -} - -template -bool LLaMADecoder::isLastLayerParallelId(uint l) -{ - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return l < num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); -} - -template -int LLaMADecoder::getFirstLayerParallelId() -{ - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return local_num_layer * pipeline_para_.rank_; -} - -template -LLaMADecoder::LLaMADecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - bool neox_rotary_style, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): - BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), - head_num_(head_num), - size_per_head_(size_per_head), - inter_size_(inter_size), - num_layer_(num_layer), - rotary_embedding_dim_(rotary_embedding_dim), - neox_rotary_style_(neox_rotary_style), - layernorm_eps_(layernorm_eps), 
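// NOTE (illustrative): the deleted decoder's isValidLayerParallelId / isFirstLayerParallelId /
// isLastLayerParallelId above split num_layer_ across pipeline ranks by ceiling division, so each
// rank owns the contiguous block [rank * local, (rank + 1) * local). Standalone sketch using the
// llama_config.ini values from this patch (decoder_layers = 60, pipeline_para_size = 4);
// illustrative only.
#if 0  // illustrative sketch
#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    const int num_layer = 60, world_size = 4;
    const int local     = (int)std::ceil(num_layer * 1.0f / world_size);  // 15 layers per rank
    for (int rank = 0; rank < world_size; ++rank) {
        const int first = local * rank;
        const int last  = std::min(num_layer, local * (rank + 1)) - 1;
        std::printf("rank %d runs layers %d..%d\n", rank, first, last);  // e.g. rank 2 -> 30..44
    }
    return 0;
}
#endif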
- hidden_units_(head_num_ * size_per_head), - pipeline_para_(pipeline_para), - custom_all_reduce_comm_(custom_all_reduce_comm), - enable_custom_all_reduce_(enable_custom_all_reduce) -{ - initialize(); -} - -template -LLaMADecoder::LLaMADecoder(LLaMADecoder const& decoder): - BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), - head_num_(decoder.head_num_), - size_per_head_(decoder.size_per_head_), - inter_size_(decoder.inter_size_), - num_layer_(decoder.num_layer_), - rotary_embedding_dim_(decoder.rotary_embedding_dim_), - neox_rotary_style_(decoder.neox_rotary_style_), - layernorm_eps_(decoder.layernorm_eps_), - hidden_units_(decoder.hidden_units_), - pipeline_para_(decoder.pipeline_para_), - custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), - enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) -{ - initialize(); -} - -template -LLaMADecoder::~LLaMADecoder() -{ - delete self_attention_layer_; - delete ffn_layer_; - freeBuffer(); -} - -template -void LLaMADecoder::forward(std::vector* output_tensors, - const std::vector* input_tensors, - const std::vector*>* llama_decoder_layer_weight) -{ - FT_CHECK(false); -} - -template -void LLaMADecoder::forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const std::vector*>* llama_decoder_layer_weight) -{ - // input tensors: - // decoder_input [local_batch_size, hidden_dimension], - // finished [local_batch_size], - // sequence_lengths [local_batch_size] - // total_padding_tokens [local_batch_size], - // max_input_length [1] on cpu - // step [1] on cpu - // ite [1] on cpu - // cache_indirection [local_batch_size / beam_width, beam_width, memory_len] - // Here, local_batch_size contains the beam_width, so local_batch_size / beam_width - // is real local_batch_size. - // masked_tokens[local_batch_size, memory_len] - - // output tensors: - // decoder_output [local_batch_size, hidden_dimension], - // key_cache [num_layer, batch_size, head_num, size_per_head // x, memory_len, x] - // value_cache [num_layer, batch_size, head_num, memory_len, size_per_head] - - FT_CHECK(input_tensors->size() == 11); - FT_CHECK(output_tensors->size() == 3); - - const DataType data_type = getTensorType(); - const size_t local_batch_size = input_tensors->at("decoder_input").shape[0]; - allocateBuffer(local_batch_size); - const int ite = input_tensors->at("ite").getVal(); - - T* decoder_input = input_tensors->at("decoder_input").getPtr(); - T* decoder_output = output_tensors->at("decoder_output").getPtr(); - - Tensor& k_cache = output_tensors->at("key_cache"); - Tensor& v_cache = output_tensors->at("value_cache"); - std::vector self_k_cache_size; - self_k_cache_size.push_back(local_batch_size); - for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { - self_k_cache_size.push_back(*t); - } - std::vector self_v_cache_size; - self_v_cache_size.push_back(local_batch_size); - for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { - self_v_cache_size.push_back(*t); - } - - for (uint l = 0; l < num_layer_; l++) { - if (isValidLayerParallelId(l) == false) { - continue; - } - T* layer_input = (l == 0) ? decoder_input : decoder_layer_output_; - T* layer_output = (l == num_layer_ - 1) ? 
decoder_output : decoder_layer_output_; - - if (isFirstLayerParallelId(l) == true && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { - int data_size = local_batch_size * hidden_units_; - // ftNcclRecv(layer_input, local_batch_size * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, - // stream_); - - ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - } - - invokeGeneralLLaMALayerNorm(decoder_normed_input_, - layer_input, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, - layernorm_eps_, - local_batch_size, - hidden_units_, - stream_); - sync_check_cuda_error(); - - TensorMap self_attention_input_tensors(*input_tensors); - self_attention_input_tensors.insert( - "input_query", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}); - - size_t cache_offset = l - getFirstLayerParallelId(); - for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { - cache_offset *= *t; - }; - size_t ite_cache_offset = ite * local_batch_size; - for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { - ite_cache_offset *= *t; - } - cache_offset += ite_cache_offset; - - TensorMap self_attention_output_tensors{ - {"hidden_features", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, self_attn_output_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, - {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; - - self_attention_layer_->forward(&self_attention_output_tensors, - &self_attention_input_tensors, - &llama_decoder_layer_weight->at(l)->self_attention_weights); - - invokeGeneralAddBiasResidualPreLayerNorm( - self_attn_output_, - decoder_normed_input_, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, - layernorm_eps_, - local_batch_size, - hidden_units_, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - 0, - stream_); - - TensorMap ffn_input_tensors( - {{"ffn_input", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}}}); - TensorMap ffn_output_tensors( - {{"ffn_output", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, layer_output}}}); - ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); - - invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - local_batch_size, - hidden_units_, - stream_); - - sync_check_cuda_error(); - - if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 - && pipeline_para_.world_size_ > 1) { - int data_size = local_batch_size * hidden_units_; - ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); - } - } - - if (is_free_buffer_after_forward_ == true) { - freeBuffer(); - } -} - -template class LLaMADecoder; -template class LLaMADecoder; - -} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoder.h b/src/fastertransformer/models/llama/LLaMADecoder.h deleted file mode 100644 index 773637d65..000000000 --- 
a/src/fastertransformer/models/llama/LLaMADecoder.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include "src/fastertransformer/kernels/add_residual_kernels.h" -#include "src/fastertransformer/kernels/layernorm_kernels.h" -#include "src/fastertransformer/layers/BaseLayer.h" -#include "src/fastertransformer/layers/FfnLayer.h" -#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" -#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" -#include "src/fastertransformer/utils/Tensor.h" -#include "src/fastertransformer/utils/allocator.h" -#include "src/fastertransformer/utils/cublasMMWrapper.h" -#include "src/fastertransformer/utils/custom_ar_comm.h" -#include "src/fastertransformer/utils/nccl_utils.h" - -namespace fastertransformer { - -template -class LLaMADecoder: public BaseLayer { -private: -protected: - void allocateBuffer() override; - void allocateBuffer(size_t batch_size); - void freeBuffer() override; - bool isValidLayerParallelId(uint l); - bool isFirstLayerParallelId(uint l); - bool isLastLayerParallelId(uint l); - int getFirstLayerParallelId(); - virtual void initialize(); - - // meta data - size_t head_num_; - size_t size_per_head_; - size_t inter_size_; - size_t num_layer_; - size_t rotary_embedding_dim_; - bool neox_rotary_style_; - size_t hidden_units_; - float layernorm_eps_; - - NcclParam pipeline_para_; - - std::shared_ptr custom_all_reduce_comm_; - int enable_custom_all_reduce_; - - T* decoder_normed_input_ = nullptr; - T* self_attn_output_ = nullptr; - T* ffn_output_ = nullptr; - T* decoder_layer_output_ = nullptr; - - BaseAttentionLayer* self_attention_layer_; - FfnLayer* ffn_layer_; - -public: - LLaMADecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - bool neox_rotary_style, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce_ = 0); - - LLaMADecoder(LLaMADecoder const& decoder); - - virtual ~LLaMADecoder(); - - virtual void forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const std::vector*>* decoder_layer_weights); - - virtual void forward(std::vector* output_tensors, - const std::vector* input_tensors, - const std::vector*>* decoder_layer_weights); -}; - -} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc index 3c40613fc..ff2ec11be 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc @@ -59,19 +59,19 @@ LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const 
LLaMADecoderLayerWeigh hidden_units_(other.hidden_units_), inter_size_(other.inter_size_) { mallocWeights(); - //cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); nullptr; + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_); - //cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ * hidden_units_); - //cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_); - //cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ * hidden_units_); - //cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_ * inter_size_); - //cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); - //cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); + cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); setWeightPtr(); } @@ -84,19 +84,19 @@ LLaMADecoderLayerWeight& LLaMADecoderLayerWeight::operator=(const LLaMADec mallocWeights(); - //cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_); - //cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ * hidden_units_); - //cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_); - //cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ * hidden_units_); - //cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_ * inter_size_); - //cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); - //cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); + cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); setWeightPtr(); return *this; @@ -107,8 +107,8 @@ void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType { FT_CHECK(is_maintain_buffer == true); -// loadWeightFromBin( -// weights_ptr[0], {(size_t)hidden_units_}, dir_path 
+ ".attention_norm.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".attention_norm.bias.bin", model_file_type); loadWeightFromBin( weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".attention_norm.weight.bin", model_file_type); @@ -116,67 +116,60 @@ void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType {(size_t)hidden_units_, (size_t)(3 * hidden_units_)}, dir_path + ".attention.query_key_value.weight.bin", model_file_type); -// loadWeightFromBin(weights_ptr[3], -// {(size_t)(3 * hidden_units_)}, -// dir_path + ".attention.query_key_value.bias.bin", -// model_file_type); + loadWeightFromBin(weights_ptr[3], + {(size_t)(3 * hidden_units_)}, + dir_path + ".attention.query_key_value.bias.bin", + model_file_type); loadWeightFromBin(weights_ptr[4], {(size_t)(hidden_units_), (size_t)hidden_units_}, dir_path + ".attention.wo.weight.bin", model_file_type); -// loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.wo.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.wo.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[6], {(size_t)hidden_units_, (size_t)(inter_size_)}, dir_path + ".feed_forward.w1.weight.bin", model_file_type); -// loadWeightFromBin( -// weights_ptr[7], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w1.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[7], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w1.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[8], {(size_t)(inter_size_), (size_t)hidden_units_}, dir_path + ".feed_forward.w2.weight.bin", model_file_type); -// loadWeightFromBin( -// weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".feed_forward.w2.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".feed_forward.w2.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[10], {(size_t)hidden_units_, (size_t)(inter_size_)}, dir_path + ".feed_forward.w3.weight.bin", model_file_type); -// loadWeightFromBin( -// weights_ptr[11], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w3.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[11], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w3.bias.bin", model_file_type); -// loadWeightFromBin(weights_ptr[12], {(size_t)hidden_units_}, dir_path + ".ffn_norm.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[12], {(size_t)hidden_units_}, dir_path + ".ffn_norm.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".ffn_norm.weight.bin", model_file_type); } template void LLaMADecoderLayerWeight::setWeightPtr() { - //pre_layernorm_weights.beta = weights_ptr[0]; - pre_layernorm_weights.beta = nullptr; + pre_layernorm_weights.beta = weights_ptr[0]; pre_layernorm_weights.gamma = weights_ptr[1]; self_attention_weights.query_weight.kernel = weights_ptr[2]; - //self_attention_weights.query_weight.bias = weights_ptr[3]; - self_attention_weights.query_weight.bias = nullptr; + self_attention_weights.query_weight.bias = weights_ptr[3]; self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; - //self_attention_weights.attention_output_weight.bias = weights_ptr[5]; - self_attention_weights.attention_output_weight.bias = nullptr; + self_attention_weights.attention_output_weight.bias = weights_ptr[5]; ffn_weights.intermediate_weight.kernel = weights_ptr[6]; - //ffn_weights.intermediate_weight.bias = 
weights_ptr[7]; - ffn_weights.intermediate_weight.bias = nullptr; + ffn_weights.intermediate_weight.bias = weights_ptr[7]; ffn_weights.output_weight.kernel = weights_ptr[8]; - //ffn_weights.output_weight.bias = weights_ptr[9]; - ffn_weights.output_weight.bias = nullptr; + ffn_weights.output_weight.bias = weights_ptr[9]; ffn_weights.gating_weight.kernel = weights_ptr[10]; - //ffn_weights.gating_weight.bias = weights_ptr[11]; - ffn_weights.gating_weight.bias = nullptr; + ffn_weights.gating_weight.bias = weights_ptr[11]; - //post_attention_layernorm_weights.beta = weights_ptr[12]; - post_attention_layernorm_weights.beta = nullptr; + post_attention_layernorm_weights.beta = weights_ptr[12]; post_attention_layernorm_weights.gamma = weights_ptr[13]; is_maintain_buffer = true; } @@ -184,20 +177,20 @@ void LLaMADecoderLayerWeight::setWeightPtr() template void LLaMADecoderLayerWeight::mallocWeights() { - //deviceMalloc(&weights_ptr[0], hidden_units_); + deviceMalloc(&weights_ptr[0], hidden_units_); deviceMalloc(&weights_ptr[1], hidden_units_); deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_); - //deviceMalloc(&weights_ptr[3], 3 * hidden_units_); + deviceMalloc(&weights_ptr[3], 3 * hidden_units_); deviceMalloc(&weights_ptr[4], hidden_units_ * hidden_units_); - //deviceMalloc(&weights_ptr[5], hidden_units_); + deviceMalloc(&weights_ptr[5], hidden_units_); deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_); - //deviceMalloc(&weights_ptr[7], inter_size_); + deviceMalloc(&weights_ptr[7], inter_size_); deviceMalloc(&weights_ptr[8], inter_size_ * hidden_units_); - //deviceMalloc(&weights_ptr[9], hidden_units_); + deviceMalloc(&weights_ptr[9], hidden_units_); deviceMalloc(&weights_ptr[10], hidden_units_ * inter_size_); - //deviceMalloc(&weights_ptr[11], inter_size_); - //deviceMalloc(&weights_ptr[12], hidden_units_); + deviceMalloc(&weights_ptr[11], inter_size_); + deviceMalloc(&weights_ptr[12], hidden_units_); deviceMalloc(&weights_ptr[13], hidden_units_); } diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc index 81a22a51d..f7081de11 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.cc +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -78,7 +78,7 @@ LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): { mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); - //cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); @@ -105,7 +105,7 @@ LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); - //cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); @@ -123,7 +123,7 @@ template void LLaMAWeight::setWeightPtr() { pre_decoder_embedding_table = weights_ptr[0]; - //post_decoder_layernorm.beta = weights_ptr[1]; + post_decoder_layernorm.beta = weights_ptr[1]; post_decoder_layernorm.beta = nullptr; post_decoder_layernorm.gamma = weights_ptr[2]; post_decoder_embedding.kernel = weights_ptr[3]; @@ -135,7 +135,7 @@ void 
LLaMAWeight::mallocWeights() weights_ptr.resize(num_base_weights); deviceMalloc(&weights_ptr[0], vocab_size_ * hidden_units_); - //deviceMalloc(&weights_ptr[1], hidden_units_); + deviceMalloc(&weights_ptr[1], hidden_units_); deviceMalloc(&weights_ptr[2], hidden_units_); deviceMalloc(&weights_ptr[3], hidden_units_ * vocab_size_); @@ -152,7 +152,7 @@ void LLaMAWeight::loadModel(std::string dir_path) {(size_t)(vocab_size_ * hidden_units_)}, dir_path + "/model.tok_embeddings.weight.bin", model_file_type); - //loadWeightFromBin(weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.norm.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.norm.bias.bin", model_file_type); loadWeightFromBin(weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.norm.weight.bin", model_file_type); loadWeightFromBin(weights_ptr[3], {(size_t)(vocab_size_ * hidden_units_)}, From 1494d2fe74c3b6a9226b30a9139a0c38ea66df96 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 15 Sep 2023 23:40:07 +0000 Subject: [PATCH 11/55] dump --- examples/cpp/llama/llama_config.ini | 6 +- examples/cpp/llama/llama_example.cc | 74 +--- examples/cpp/llama/llama_example_utils.cc | 6 +- examples/cpp/llama/start_ids.csv | 12 +- .../LLaMAContextAttentionLayer.cc | 342 ++++++--------- .../LLaMAContextAttentionLayer.h | 87 ++-- src/fastertransformer/models/llama/LLaMA.cc | 390 ++++++------------ src/fastertransformer/models/llama/LLaMA.h | 22 +- .../models/llama/LLaMAContextDecoder.cc | 276 ++++++------- .../models/llama/LLaMAWeight.cc | 1 - 10 files changed, 437 insertions(+), 779 deletions(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 68f4663d1..9cb766533 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -7,8 +7,7 @@ pipeline_para_size=4 [request] beam_width=1 # beam width for beam search -request_batch_size=8 # determine by the request -request_output_len=0 # determine by the request +request_batch_size=4 # determine by the request [llama_33B] head_num=52 @@ -17,5 +16,4 @@ vocab_size=32000 decoder_layers=60 rotary_embedding=128 multiple_of=256 -start_id=0 -end_id=2 +padding_id=0 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index c1f4521bf..2955cbb14 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -81,16 +81,13 @@ void llama_example(const INIReader reader) const size_t decoder_layers = reader.GetInteger(model_name, "decoder_layers"); const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); const int multiple_of = reader.GetInteger(model_name, "multiple_of"); - const int start_id = reader.GetInteger(model_name, "start_id"); - const int end_id = reader.GetInteger(model_name, "end_id"); const size_t hidden_units = head_num * size_per_head; const size_t inter_size = multiple_of * ((2 * hidden_units + multiple_of - 1) / multiple_of); - const size_t beam_width = reader.GetInteger("request", "beam_width"); const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); - const int request_output_len = reader.GetInteger("request", "request_output_len"); const int min_length = reader.GetInteger("request", "min_length", 0); + const int padding_id = reader.GetInteger(model_name, "padding_id"); FT_CHECK(decoder_layers % pipeline_para_size == 0); @@ -128,29 +125,6 @@ void llama_example(const INIReader reader) NcclParam pipeline_para; 
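// NOTE (illustrative): worked example of the inter_size rounding used above. With the [llama_33B]
// values from llama_config.ini in this patch (head_num = 52, size_per_head = 128, multiple_of = 256),
// hidden_units = 6656 and 2 * hidden_units is already a multiple of 256, so no padding is added.
#if 0  // illustrative check
constexpr size_t head_num_33b      = 52;
constexpr size_t size_per_head_33b = 128;
constexpr size_t multiple_of_33b   = 256;
constexpr size_t hidden_units_33b  = head_num_33b * size_per_head_33b;  // 6656
constexpr size_t inter_size_33b =
    multiple_of_33b * ((2 * hidden_units_33b + multiple_of_33b - 1) / multiple_of_33b);
static_assert(inter_size_33b == 13312, "13312 == 2 * 6656, rounded up to a multiple of 256");
#endif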
ftNcclInitialize(tensor_para, pipeline_para, 1, pipeline_para_size); - // Handle bad_words dictionary - std::vector bad_words; - read_word_list("../examples/cpp/llama/bad_words.csv", bad_words); - - int* d_bad_words = nullptr; - deviceMalloc(&d_bad_words, bad_words.size(), false); - cudaH2Dcpy(d_bad_words, bad_words.data(), bad_words.size()); - - // Handle stop_words dictionary - std::vector stop_words; - read_word_list("../examples/cpp/llama/stop_words.csv", stop_words); - - const size_t stop_words_len = stop_words.size() / 2; - // Tile with same dict for each element - std::vector tiled_stop_words; - for (int i = 0; i < request_batch_size; i++) { - tiled_stop_words.insert(tiled_stop_words.end(), stop_words.begin(), stop_words.end()); - } - - int* d_stop_words = nullptr; - deviceMalloc(&d_stop_words, tiled_stop_words.size(), false); - cudaH2Dcpy(d_stop_words, tiled_stop_words.data(), tiled_stop_words.size()); - // Read ids of request from file. size_t max_input_len = -1; std::vector v_start_lengths; @@ -159,7 +133,7 @@ void llama_example(const INIReader reader) &v_start_lengths, &v_start_ids, max_input_len, - end_id, + padding_id, 1, "../examples/cpp/llama/start_ids.csv"); @@ -177,10 +151,8 @@ void llama_example(const INIReader reader) cudaH2Dcpy(d_input_ids, v_start_ids.data(), request_batch_size * max_input_len); cudaH2Dcpy(d_input_lengths, v_start_lengths.data(), request_batch_size); } - std::vector start_ids(request_batch_size, start_id); - std::vector end_ids(request_batch_size, end_id); - const int total_output_len = max_input_len + request_output_len; + const int total_output_len = max_input_len; cudaStream_t stream; cublasHandle_t cublas_handle; @@ -203,12 +175,8 @@ void llama_example(const INIReader reader) cublas_wrapper.setFP32GemmConfig(); } - fastertransformer::LLaMAWeight llama_weights(hidden_units, - inter_size, - vocab_size, - decoder_layers, - pipeline_para.world_size_, - pipeline_para.rank_); + fastertransformer::LLaMAWeight llama_weights( + hidden_units, inter_size, vocab_size, decoder_layers, pipeline_para.world_size_, pipeline_para.rank_); model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; llama_weights.loadModel(model_dir); @@ -234,22 +202,20 @@ void llama_example(const INIReader reader) decoder_layers, vocab_size, rotary_embedding_dim, - start_id, - end_id, random_seed, tensor_para, pipeline_para, stream, &cublas_wrapper, &allocator, - false, + false, // is_free_buffer_after_forward &prop, attention_type); int* d_output_ids; int* d_sequence_lengths; - deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); - deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); + deviceMalloc(&d_output_ids, request_batch_size * total_output_len, false); + deviceMalloc(&d_sequence_lengths, request_batch_size, false); std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", @@ -257,27 +223,18 @@ void llama_example(const INIReader reader) {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size}, d_input_lengths}}, {"output_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}, - {"bad_words_list", Tensor{MEMORY_GPU, TYPE_INT32, {2, bad_words.size() / 2}, d_bad_words}}, - {"stop_words_list", Tensor{MEMORY_GPU, TYPE_INT32, {request_batch_size, 2, stop_words_len}, d_stop_words}}, {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{1}, &min_length}}, - 
{"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, start_ids.data()}}, - {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, end_ids.data()}}}; - - input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}); + {"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}}; std::unordered_map output_tensors = std::unordered_map{ {"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, - std::vector{request_batch_size, beam_width, (size_t)total_output_len}, + std::vector{request_batch_size, 1, (size_t)total_output_len}, d_output_ids}}, {"sequence_length", - Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, beam_width}, d_sequence_lengths}}, - {"output_log_probs", - Tensor{MEMORY_GPU, - TYPE_FP32, - std::vector{(size_t)request_output_len, request_batch_size, beam_width}, - nullptr}}}; + Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size}, d_sequence_lengths}}, + }; print_mem_usage(); @@ -307,7 +264,7 @@ void llama_example(const INIReader reader) printf("[WARNING] Cannot write results into output file %s \n", fName.c_str()); } else { - size_t outCount = total_output_len * request_batch_size * beam_width; + size_t outCount = total_output_len * request_batch_size; int* hBuf = new int[outCount]; cudaD2Hcpy(hBuf, d_output_ids, outCount); @@ -357,10 +314,9 @@ void llama_example(const INIReader reader) cudaProfilerStop(); - printf("[INFO] request_batch_size %ld beam_width %ld head_num %ld size_per_head %ld total_output_len %d" + printf("[INFO] request_batch_size %ld head_num %ld size_per_head %ld total_output_len %d" " decoder_layers %ld vocab_size %ld FT-CPP-decoding-beamsearch-time %.2f ms\n", request_batch_size, - beam_width, head_num, size_per_head, total_output_len, @@ -374,8 +330,6 @@ void llama_example(const INIReader reader) delete cublas_algo_map; delete cublas_wrapper_mutex; - cudaFree(d_bad_words); - cudaFree(d_stop_words); if (d_input_ids != nullptr) { cudaFree(d_input_ids); } diff --git a/examples/cpp/llama/llama_example_utils.cc b/examples/cpp/llama/llama_example_utils.cc index 77f621dbf..d6db80856 100644 --- a/examples/cpp/llama/llama_example_utils.cc +++ b/examples/cpp/llama/llama_example_utils.cc @@ -26,7 +26,7 @@ int read_start_ids(size_t batch_size, std::vector* v_start_lengths, std::vector* v_start_ids, size_t& max_input_len, - const int end_id, + const int padding_id, const int beam_width, std::string file_name) { @@ -68,7 +68,7 @@ int read_start_ids(size_t batch_size, while ((int)tmp_start_lengths.size() < batch_size) { std::vector padding_ids; for (int i = 0; i < max_input_len; i++) { - padding_ids.push_back(end_id); + padding_ids.push_back(padding_id); } tmp_start_ids.push_back(padding_ids); tmp_start_lengths.push_back(max_input_len); @@ -77,7 +77,7 @@ int read_start_ids(size_t batch_size, // Add padding for (int i = 0; i < (int)tmp_start_ids.size(); i++) { for (int j = (int)tmp_start_ids[i].size(); j < max_input_len; j++) { - tmp_start_ids[i].push_back(end_id); + tmp_start_ids[i].push_back(padding_id); } } diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 88e742f39..a74083153 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1,8 +1,4 @@ -688, 253, 1390, 4564, 273, 1897, 13, 247 -510, 1457, 8911, 4487, 273, 26593, 310, 6600 -510, 1457, 2816, 28260, 452, 247, 747, 1481 -510, 1457, 2816, 7717, 556, 3863, 697, 7970 -688, 247, 2118, 326, 588, 2779, 1056, 352 -510, 1457, 2816, 28260, 8, 
13413, 19169, 14745 -510, 9462, 5687, 556, 38350, 26212, 253, 747 -510, 806, 673, 309, 3047, 253, 6440, 13 \ No newline at end of file +1, 14542, 3262, 8112, 29901, 7803, 1757, 526, 13407, 297, 263, 13569, 29889, 2688, 526, 13587, 701, 27815, 29889, 0 +1, 5057, 12500, 29901, 319, 8023, 338, 2734, 1623, 263, 5702, 29889, 450, 8023, 6057, 964, 263, 1559, 29889, 0 +1, 5057, 12500, 29901, 319, 8023, 338, 2734, 1623, 263, 5702, 29889, 450, 8023, 4947, 297, 263, 1775, 29889, 0 +1, 28551, 292, 29901, 11647, 526, 13407, 373, 263, 17306, 310, 15007, 29889, 11647, 526, 1985, 2768, 263, 5214, 29889 diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 38ec79b47..8837acb82 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -16,6 +16,7 @@ */ #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" #include "src/fastertransformer/kernels/unfused_attention_kernels.h" #include "src/fastertransformer/utils/nvtx_utils.h" @@ -28,16 +29,15 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten { // input_tensors: // input_query [token_num, hidden_dimension] - // attention_mask [batch_size, 1, seq_len, seq_len + max_prompt_length] + // attention_mask [batch_size, 1, seq_len, seq_len] // attention_type [1] // is_final_layer [1], bool on cpu // layer_id [1], int on cpu // padding_offset, int, [token_num] (optional) // cu_seqlens, int, [batch_size] (optional) - // d_prefix_prompt_batch [global_batch_size], (optional) - // each element contains ptr with buffer shape[2, local_head_num_, prompt_length, size_per_head] - // d_prefix_prompt_lengths [batch_size], int (optional) - // linear_bias_slopes [head_num] (optional) + // each element contains ptr with buffer shape[2, head_num_, prompt_length, size_per_head] + // pre_layernorm_weights_gamma [hidden_dimension] + // pre_layernorm_weights_beta [hidden_dimension] // output_tensors: // hidden_features [token_num, hidden_dimension] @@ -45,20 +45,14 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // value_cache [batch, local_head_num, max_seq_len, size_per_head] FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); FT_CHECK(output_tensors->at("key_cache").shape.size() == 5); - FT_CHECK(output_tensors->at("value_cache").shape.size() == 4 - || output_tensors->at("value_cache").shape.size() == 3); - const int request_batch_size = input_tensors->at("attention_mask").shape[0]; - const int request_seq_len = input_tensors->at("attention_mask").shape[2]; - const int max_prompt_length = - input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; - const int layer_id = input_tensors->getVal("layer_id"); - const T** d_prefix_prompt_batch = input_tensors->getPtr("d_prefix_prompt_batch", nullptr); - const int* d_prefix_prompt_lengths = input_tensors->getPtr("d_prefix_prompt_lengths", nullptr); - const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); - int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); - T* linear_bias_slopes = input_tensors->getPtr("linear_bias_slopes", nullptr); - /* float* attention_query_dynamic_scale = input_tensors->getPtr("attention_query_dynamic_scale", - * nullptr); */ + FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); + const 
int request_batch_size = input_tensors->at("attention_mask").shape[0]; + const int request_seq_len = input_tensors->at("attention_mask").shape[2]; + const int layer_id = input_tensors->getVal("layer_id"); + const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); + int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + const T* pre_layernorm_weights_gamma = input_tensors->getPtr("pre_layernorm_weights_gamma"); + const T* pre_layernorm_weights_beta = input_tensors->getPtr("pre_layernorm_weights_beta"); T* attention_out = output_tensors->at("hidden_features").getPtr(); T* attention_input = input_tensors->at("input_query").getPtr(); @@ -69,89 +63,84 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten "LLaMA Context FUSED_PADDED_MHA is not supported !"); PUSH_RANGE("attention buffer alloc"); - allocateBuffer(request_batch_size, request_seq_len + max_prompt_length, attention_type != AttentionType::FUSED_MHA); + allocateBuffer(request_batch_size, request_seq_len, attention_type != AttentionType::FUSED_MHA); POP_RANGE; sync_check_cuda_error(); const bool is_final = input_tensors->at("is_final_layer").getVal(); + const int m = input_tensors->at("input_query").shape[0]; - const int m = input_tensors->at("input_query").shape[0]; + PUSH_RANGE("attention buffer alloc"); + invokeGeneralLLaMALayerNorm(decoder_normed_input_, + attention_input, + pre_layernorm_weights_gamma, + pre_layernorm_weights_beta, + layernorm_eps_, + m, + hidden_units_, + stream_); + sync_check_cuda_error(); + POP_RANGE; + // if (l == 0) { + // T* out = (T*)malloc(sizeof(T) * h_token_num * hidden_units_); + // cudaMemcpy(out, decoder_normed_input_, sizeof(T) * h_token_num * hidden_units_, cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < h_token_num; ++b) { + // std::cout << "["; + // int i = 0; + // for (int h = 0; h < hidden_units_; ++h) { + // std::cout << out[b * hidden_units_ + h] << " "; + // ++i; + // if (i == 8) + // break; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } + sync_check_cuda_error(); PUSH_RANGE("qkv_gemm"); -#ifdef SPARSITY_ENABLED - const int m_padded = 8 * div_up(m, 8); - bool use_sparse = sparse_ && cublas_wrapper_->isUseSparse(1, 3 * local_hidden_units_, m_padded, hidden_units_); -#else - constexpr bool use_sparse = false; -#endif - - if (use_sparse) { -#ifdef SPARSITY_ENABLED - cublas_wrapper_->SpGemm(CUBLAS_OP_N, - CUBLAS_OP_N, - 3 * local_hidden_units_, - m_padded, - hidden_units_, - attention_weights->query_weight.sp_kernel, - attention_input, - qkv_buf_); -#endif - } - else if (int8_mode_ == 1) { - FT_CHECK(weight_only_int8_fc_runner_.get() != NULL && attention_weights->query_weight.int8_kernel != NULL - && attention_weights->query_weight.weight_only_quant_scale != NULL); - - weight_only_int8_fc_runner_->gemm(attention_input, - reinterpret_cast(attention_weights->query_weight.int8_kernel), - attention_weights->query_weight.weight_only_quant_scale, - qkv_buf_, - m, - 3 * local_hidden_units_, - hidden_units_, - mixed_gemm_workspace_, - mixed_gemm_ws_bytes_, - stream_); - } - else if (int8_mode_ == 2) { - cublas_wrapper_->Int8Gemm(3 * local_hidden_units_, - m, - hidden_units_, - attention_weights->query_weight.int8_kernel, - hidden_units_, - input_tensors->at("input_query").getPtr(), - hidden_units_, - reinterpret_cast(qkv_buf_), - 3 * local_hidden_units_, - attention_weights->query_weight.scale_inter, - true); - } - else { - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - 3 * local_hidden_units_, 
// n - m, - hidden_units_, // k - attention_weights->query_weight.kernel, - 3 * local_hidden_units_, // n - attention_input, - hidden_units_, // k - qkv_buf_, - 3 * local_hidden_units_ /* n */); - } - + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + 3 * hidden_units_, // n + m, + hidden_units_, // k + attention_weights->query_weight.kernel, + 3 * hidden_units_, // n + decoder_normed_input_, + hidden_units_, // k + qkv_buf_, + 3 * hidden_units_ /* n */); sync_check_cuda_error(); + // if (layer_id == 0) { + // T* qkv_buf = (T*)malloc(sizeof(T) * m * 3 * hidden_units_); + // cudaMemcpy(qkv_buf, qkv_buf_, sizeof(T) * m * 3 * hidden_units_, cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < request_batch_size; ++b) { + // std::cout << "["; + // for (int s = 0; s < request_seq_len; ++s) { + // std::cout << "["; + // for (int h = 0; h < 8; ++h) { + // std::cout << qkv_buf[((b * request_seq_len) + s) * 3 * hidden_units_ + h + 2 * hidden_units_] + // << " "; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } // IDEA: append prefix prompt key value here - PrefixPromptBatchWeightsParam param{d_prefix_prompt_batch, - d_prefix_prompt_lengths, - max_prompt_length, - (size_t)layer_id * 2 * local_head_num_ * size_per_head_}; + PrefixPromptBatchWeightsParam param{nullptr, nullptr, 0, (size_t)layer_id * 2 * head_num_ * size_per_head_}; if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync( - q_buf_2_, 0, request_batch_size * request_seq_len * 3 * local_hidden_units_ * sizeof(T), stream_); + cudaMemsetAsync(q_buf_2_, 0, request_batch_size * request_seq_len * 3 * hidden_units_ * sizeof(T), stream_); } invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, @@ -163,12 +152,12 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten request_batch_size, request_seq_len, m, - local_head_num_, + head_num_, size_per_head_, rotary_embedding_dim_, neox_rotary_style_, attention_weights->query_weight.scale_out, - int8_mode_, + 0, // int8_mode stream_); sync_check_cuda_error(); @@ -181,13 +170,14 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten k_buf_2_, v_buf_2_, request_batch_size, - max_prompt_length + request_seq_len, // max input length + prefix prompt length + request_seq_len, max_seq_len, size_per_head_, - local_head_num_, + head_num_, stream_); - // IDEA : after this, k_cache = (batch_size, num_heads, Dh/x, prefix_prompt_len + L, x) - // k_cache = (batch_size, num_heads, prefix_prompt_len + L, Dh) + // IDEA : after this, + // k_cache = (batch_size, num_heads, Dh/x, L, x) + // v_cache = (batch_size, num_heads, L, Dh) sync_check_cuda_error(); // TODO: fmha kernels doesn't support different seq lengths of q and kv @@ -200,8 +190,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten POP_RANGE; if (is_final == false) { const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = request_seq_len; // q length - const int attention_seq_len_2 = max_prompt_length + request_seq_len; // kv length + const int attention_seq_len_1 = request_seq_len; // q length + const int attention_seq_len_2 = request_seq_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); if (attention_type != AttentionType::FUSED_MHA) { if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { @@ -225,7 +215,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten CUDA_R_32F, attention_seq_len_2, // n 
attention_seq_len_2 * attention_seq_len_1, - request_batch_size * local_head_num_, // global batch size + request_batch_size * head_num_, // global batch size CUDA_R_32F); sync_check_cuda_error(); @@ -239,9 +229,9 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten param.batch_size = request_batch_size; param.q_length = attention_seq_len_1; param.k_length = attention_seq_len_2; - param.num_heads = local_head_num_; + param.num_heads = head_num_; param.qk_scale = qk_scale; - param.linear_bias_slopes = const_cast(linear_bias_slopes); // (head_num,), optional + param.linear_bias_slopes = nullptr; invokeMaskedSoftmax(param, stream_); sync_check_cuda_error(); POP_RANGE; @@ -262,7 +252,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qk_buf_, attention_seq_len_2, attention_seq_len_2 * attention_seq_len_1, - request_batch_size * local_head_num_); + request_batch_size * head_num_); POP_RANGE; PUSH_RANGE("softmax"); @@ -273,9 +263,9 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten param.batch_size = request_batch_size; param.q_length = attention_seq_len_1; param.k_length = attention_seq_len_2; - param.num_heads = local_head_num_; + param.num_heads = head_num_; param.qk_scale = qk_scale; - param.linear_bias_slopes = const_cast(linear_bias_slopes); // (head_num,), optional + param.linear_bias_slopes = nullptr; invokeMaskedSoftmax(param, stream_); sync_check_cuda_error(); POP_RANGE; @@ -297,7 +287,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_2_, size_per_head_, attention_seq_len_1 * size_per_head_, - request_batch_size * local_head_num_); + request_batch_size * head_num_); // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) if (padding_offset == nullptr) { @@ -305,10 +295,10 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_2_, request_batch_size, attention_seq_len_1, - local_head_num_, + head_num_, size_per_head_, attention_weights->attention_output_weight.scale, - int8_mode_, + 0, // int8_mode stream_); sync_check_cuda_error(); } @@ -318,11 +308,11 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten m, request_batch_size, attention_seq_len_1, - local_head_num_, + head_num_, size_per_head_, padding_offset, attention_weights->attention_output_weight.scale, - int8_mode_, + 0, // int8_mode stream_); } POP_RANGE; @@ -330,68 +320,18 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten sync_check_cuda_error(); PUSH_RANGE("proj gemm"); -#ifdef SPARSITY_ENABLED - bool use_sparse = sparse_ && cublas_wrapper_->isUseSparse(1, hidden_units_, m_padded, local_hidden_units_); -#endif - if (use_sparse) { -#ifdef SPARSITY_ENABLED - cublas_wrapper_->SpGemm(CUBLAS_OP_N, - CUBLAS_OP_N, - hidden_units_, - m_padded, - local_hidden_units_, - attention_weights->attention_output_weight.sp_kernel, - qkv_buf_3_, - attention_out); -#endif - } - else { - if (int8_mode_ == 1) { - FT_CHECK(weight_only_int8_fc_runner_.get() != NULL - && attention_weights->attention_output_weight.int8_kernel != NULL - && attention_weights->attention_output_weight.weight_only_quant_scale != NULL); - - weight_only_int8_fc_runner_->gemm( - qkv_buf_3_, - reinterpret_cast(attention_weights->attention_output_weight.int8_kernel), - attention_weights->attention_output_weight.weight_only_quant_scale, - attention_out, - m, - hidden_units_, - local_hidden_units_, - mixed_gemm_workspace_, - mixed_gemm_ws_bytes_, - stream_); - } - else if (int8_mode_ == 2) { - 
int8_fc_runner_->gemm(reinterpret_cast(qkv_buf_3_), - attention_weights->attention_output_weight.int8_kernel, - QuantMode::PerTensorQuant, - attention_weights->attention_output_weight.scale_inter, - attention_weights->attention_output_weight.scale_out, - output_tensors->at("hidden_features").getPtr(), - m, - hidden_units_, - local_hidden_units_, - nullptr, - 0, - stream_); - } - else { - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - hidden_units_, - m, - local_hidden_units_, - attention_weights->attention_output_weight.kernel, - hidden_units_, - qkv_buf_3_, - local_hidden_units_, - attention_out, - hidden_units_); - } - } + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + hidden_units_, + m, + hidden_units_, + attention_weights->attention_output_weight.kernel, + hidden_units_, + qkv_buf_3_, + hidden_units_, + attention_out, + hidden_units_); POP_RANGE; } @@ -420,14 +360,11 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), - local_head_num_(head_num), - local_hidden_units_(local_head_num_ * size_per_head), rotary_embedding_dim_(0), neox_rotary_style_(false), is_qk_buf_float_(is_qk_buf_float || int8_mode == 2), weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), - int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr), - int8_mode_(int8_mode) + int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr) { } @@ -450,17 +387,14 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), - local_head_num_(local_head_num), - local_hidden_units_(local_head_num_ * size_per_head), rotary_embedding_dim_(0), neox_rotary_style_(false), is_qk_buf_float_(is_qk_buf_float || int8_mode == 2), weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), - int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr), - int8_mode_(int8_mode) + int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - dispatcher_fp16.reset(new FusedMHARunnerFP16v2(local_head_num_, size_per_head_, sm_, 1.0f)); + dispatcher_fp16.reset(new FusedMHARunnerFP16v2(head_num_, size_per_head_, sm_, 1.0f)); } template @@ -484,17 +418,14 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), - local_head_num_(local_head_num), - local_hidden_units_(local_head_num_ * size_per_head), rotary_embedding_dim_(rotary_embedding_dim), neox_rotary_style_(neox_rotary_style), is_qk_buf_float_(is_qk_buf_float), weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), - int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr), - int8_mode_(int8_mode) + int8_fc_runner_(int8_mode == 2 ? 
std::make_shared>() : nullptr) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - dispatcher_fp16.reset(new FusedMHARunnerFP16v2(local_head_num_, size_per_head_, sm_, 1.0f)); + dispatcher_fp16.reset(new FusedMHARunnerFP16v2(head_num_, size_per_head_, sm_, 1.0f)); } template @@ -509,14 +440,11 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(LLaMAContextAttentionL head_num_(attention_layer.head_num_), size_per_head_(attention_layer.size_per_head_), hidden_units_(attention_layer.hidden_units_), - local_head_num_(attention_layer.local_head_num_), - local_hidden_units_(attention_layer.local_hidden_units_), rotary_embedding_dim_(attention_layer.rotary_embedding_dim_), neox_rotary_style_(attention_layer.neox_rotary_style_), is_qk_buf_float_(attention_layer.is_qk_buf_float_), weight_only_int8_fc_runner_(attention_layer.weight_only_int8_fc_runner_), - int8_fc_runner_(attention_layer.int8_fc_runner_), - int8_mode_(attention_layer.int8_mode_) + int8_fc_runner_(attention_layer.int8_fc_runner_) { } @@ -537,54 +465,33 @@ template void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - // const auto type_size = int8_mode_ == 2 ? sizeof(int8_t) : sizeof(T); - // NOTE (perkzz): use sizeof(T) here for cutlass int8 kernels. - const auto type_size = sizeof(T); - qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, type_size * 3 * batch_size * seq_len * local_hidden_units_, true); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * local_hidden_units_, true); - k_buf_2_ = q_buf_2_ + batch_size * seq_len * local_hidden_units_; - v_buf_2_ = k_buf_2_ + batch_size * seq_len * local_hidden_units_; + qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, true); + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, true); + k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; + v_buf_2_ = k_buf_2_ + batch_size * seq_len * hidden_units_; + decoder_normed_input_ = reinterpret_cast( + allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); // save memory usage when using fmha if (allocate_qk_buf) { - qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * local_head_num_ * seq_len * seq_len, true); + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, true); } else { allocator_->free((void**)(&qk_buf_)); } - qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * local_hidden_units_, true); - qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, type_size * batch_size * seq_len * local_hidden_units_, true); + qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, true); + qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, true); if (is_qk_buf_float_ == true) { if (allocate_qk_buf) { qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * local_head_num_ * seq_len * seq_len, true); + qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, true); } else { allocator_->free((void**)(&qk_buf_float_)); } } - if (int8_mode_ == 1) { - // We use max_size for n and k since we reuse buffers for both FCs and want to allocate the max - // possible memory that would be required by any of the individual gemms. 
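A note on the buffer layout above: `q_buf_2_` is one contiguous allocation of `3 * batch_size * seq_len * hidden_units_` elements, and `k_buf_2_` / `v_buf_2_` are fixed offsets into it, so Q, K and V sit back to back in memory. The following standalone sketch shows the same carving, with plain `malloc` standing in for the FasterTransformer allocator and the sizes chosen arbitrarily for illustration.

```
#include <cstdlib>

int main() {
    const std::size_t batch_size = 4, seq_len = 32, hidden_units = 6656;
    const std::size_t chunk = batch_size * seq_len * hidden_units;

    // One contiguous block holding Q, K and V back to back, mirroring how
    // q_buf_2_, k_buf_2_ and v_buf_2_ are carved out in allocateBuffer().
    float* q = static_cast<float*>(std::malloc(sizeof(float) * 3 * chunk));
    if (q == nullptr) {
        return 1;
    }
    float* k = q + chunk;  // K view starts right after the Q region
    float* v = k + chunk;  // V view starts right after the K region

    q[0] = 1.f; k[0] = 2.f; v[0] = 3.f;  // each pointer indexes its own third of the block

    std::free(q);  // a single free releases all three views
    return 0;
}
```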
- const int max_size = std::max(hidden_units_, 3 * local_hidden_units_); - mixed_gemm_ws_bytes_ = weight_only_int8_fc_runner_->getWorkspaceSize(batch_size * seq_len, max_size, max_size); - mixed_gemm_workspace_ = (char*)allocator_->reMalloc(mixed_gemm_workspace_, mixed_gemm_ws_bytes_, false); - } - - if (int8_mode_ == 1) { - // We use max_size for n and k since we reuse buffers for both FCs and want to allocate the max - // possible memory that would be required by any of the individual gemms. - const int max_size = std::max(hidden_units_, 3 * local_hidden_units_); - mixed_gemm_ws_bytes_ = weight_only_int8_fc_runner_->getWorkspaceSize(batch_size * seq_len, max_size, max_size); - mixed_gemm_workspace_ = (char*)allocator_->reMalloc(mixed_gemm_workspace_, mixed_gemm_ws_bytes_, false); - } - else if (int8_mode_ == 2) { - const int max_size = std::max(hidden_units_, 3 * local_hidden_units_); - int8_gemm_ws_bytes_ = int8_fc_runner_->getWorkspaceSize(batch_size * seq_len, max_size, max_size); - int8_gemm_workspace_ = (char*)allocator_->reMalloc(int8_gemm_workspace_, int8_gemm_ws_bytes_, false); - } is_allocate_buffer_ = true; } @@ -598,6 +505,7 @@ void LLaMAContextAttentionLayer::freeBuffer() allocator_->free((void**)(&qk_buf_)); allocator_->free((void**)(&qkv_buf_2_)); allocator_->free((void**)(&qkv_buf_3_)); + allocator_->free((void**)(&decoder_normed_input_)); if (is_qk_buf_float_ == true) { allocator_->free((void**)(&qk_buf_float_)); diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index 6a18d734e..e52fdc0a7 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -32,13 +32,12 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { size_t max_seq_len_ = 0; // metadata - const size_t head_num_; - const size_t size_per_head_; - const size_t hidden_units_; - const size_t local_head_num_; - const size_t local_hidden_units_; - const size_t rotary_embedding_dim_; - const bool neox_rotary_style_; + const size_t head_num_; + const size_t size_per_head_; + const size_t hidden_units_; + const size_t rotary_embedding_dim_; + const bool neox_rotary_style_; + static constexpr float layernorm_eps_ = 1e-6f; // fmha runner int sm_ = getSMVersion(); @@ -73,52 +72,48 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { size_t mixed_gemm_ws_bytes_ = 0; char* int8_gemm_workspace_ = nullptr; size_t int8_gemm_ws_bytes_ = 0; - - // int8_mode_ == 0 means we don't use any mechanism related to INT8. 
- // int8_mode_ == 1 for weight quantized only gemm for GPT - // int8_mode_ == 2 for SmoothQuant O3 (per tensor scales) - const int int8_mode_ = 0; + T* decoder_normed_input_ = nullptr; public: LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, - size_t head_num, - size_t size_per_head, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse = false, - int int8_mode = 0); + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse = false, + int int8_mode = 0); LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, - size_t head_num, - size_t size_per_head, - size_t local_head_num, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse = false, - int int8_mode = 0); + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + size_t local_head_num, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse = false, + int int8_mode = 0); LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, - size_t head_num, - size_t size_per_head, - size_t local_head_num, - size_t rotary_embedding_dim, - bool neox_rotary_style_, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse = false, - int int8_mode = 0); + size_t max_seq_len, + size_t head_num, + size_t size_per_head, + size_t local_head_num, + size_t rotary_embedding_dim, + bool neox_rotary_style_, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + bool sparse = false, + int int8_mode = 0); LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer); diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index fb8eb4f9c..3b52fe2e1 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -51,61 +51,37 @@ void LLaMA::allocateBuffer() } template -void LLaMA::allocateBuffer( - size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) +void LLaMA::allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - const size_t batchxbeam = batch_size * beam_width; const size_t self_cache_size = - (num_layer_ / pipeline_para_.world_size_) * batchxbeam * max_cache_seq_len * hidden_units_; + (num_layer_ / pipeline_para_.world_size_) * batch_size * max_cache_seq_len * hidden_units_; input_attention_mask_ = (T*)(allocator_->reMalloc( - input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); - decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_cache_seq_len, false)); decoder_output_buf_ = - (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - normed_decoder_output_buf_ = - 
(T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); - logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_, false)); - nccl_logits_buf_ = - (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_, false)); - cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); - finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); - h_finished_buf_ = new bool[batchxbeam]; - sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); + (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); + logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * vocab_size_, false)); + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batch_size, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); value_cache_ = key_cache_ + self_cache_size; - if (beam_width > 1) { - cache_indirections_[0] = - (int*)(allocator_->reMalloc(cache_indirections_[0], sizeof(int) * batchxbeam * max_seq_len * 2, true)); - cache_indirections_[1] = cache_indirections_[0] + batchxbeam * max_seq_len; - } tiled_input_ids_buf_ = - (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); - tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batchxbeam, true)); - tiled_total_padding_count_ = - (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), false); + (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batch_size * max_input_len, true)); + tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batch_size, true)); transposed_output_ids_buf_ = - (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); - output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); - parent_ids_buf_ = (int*)(allocator_->reMalloc(parent_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); - seq_limit_len_ = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false)); - masked_tokens_ = (bool*)(allocator_->reMalloc(masked_tokens_, sizeof(bool) * batchxbeam * max_cache_seq_len, true)); + (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batch_size * max_seq_len, true)); + output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batch_size * max_seq_len, true)); start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, false)); end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false)); context_decoder_input_buf_ = (T*)(allocator_->reMalloc( - context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); + context_decoder_input_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); context_decoder_output_buf_ = (T*)(allocator_->reMalloc( - context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); - output_log_probs_buf_ = - (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false)); - - generation_should_stop_ = 
(bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + context_decoder_output_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); is_allocate_buffer_ = true; } @@ -115,14 +91,8 @@ void LLaMA::freeBuffer() { if (is_allocate_buffer_) { allocator_->free((void**)(&input_attention_mask_)); - allocator_->free((void**)(&decoder_input_buf_)); allocator_->free((void**)(&decoder_output_buf_)); - allocator_->free((void**)(&normed_decoder_output_buf_)); allocator_->free((void**)(&logits_buf_)); - allocator_->free((void**)(&nccl_logits_buf_)); - allocator_->free((void**)(&cum_log_probs_)); - allocator_->free((void**)(&finished_buf_)); - delete[] h_finished_buf_; allocator_->free((void**)(&sequence_lengths_)); allocator_->free((void**)(&key_cache_)); @@ -132,22 +102,14 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&tiled_input_ids_buf_)); allocator_->free((void**)(&tiled_input_lengths_buf_)); - allocator_->free((void**)(&tiled_total_padding_count_)); allocator_->free((void**)(&transposed_output_ids_buf_)); allocator_->free((void**)(&output_ids_buf_)); - allocator_->free((void**)(&parent_ids_buf_)); - allocator_->free((void**)(&seq_limit_len_)); - allocator_->free((void**)(&masked_tokens_)); - allocator_->free((void**)(&start_ids_buf_)); allocator_->free((void**)(&end_ids_buf_)); allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); - allocator_->free((void**)(&output_log_probs_buf_)); - - allocator_->free((void**)(&generation_should_stop_), true); is_allocate_buffer_ = false; } @@ -160,8 +122,6 @@ LLaMA::LLaMA(size_t head_num, size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, - int start_id, - int end_id, unsigned long long random_seed, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -178,8 +138,6 @@ LLaMA::LLaMA(size_t head_num, num_layer_(num_layer), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), - start_id_(start_id), - end_id_(end_id), hidden_units_(head_num * size_per_head), attention_type_(attention_type) { @@ -195,8 +153,6 @@ LLaMA::LLaMA(size_t head_num, size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, - int start_id, - int end_id, unsigned long long random_seed, NcclParam tensor_para, NcclParam pipeline_para, @@ -215,8 +171,6 @@ LLaMA::LLaMA(size_t head_num, num_layer_(num_layer), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), - start_id_(start_id), - end_id_(end_id), hidden_units_(head_num * size_per_head), pipeline_para_(pipeline_para), custom_all_reduce_comm_(custom_all_reduce_comm), @@ -235,8 +189,6 @@ LLaMA::LLaMA(LLaMA const& llama): num_layer_(llama.num_layer_), vocab_size_(llama.vocab_size_), rotary_embedding_dim_(llama.rotary_embedding_dim_), - start_id_(llama.start_id_), - end_id_(llama.end_id_), hidden_units_(llama.hidden_units_), pipeline_para_(llama.pipeline_para_), custom_all_reduce_comm_(llama.custom_all_reduce_comm_), @@ -284,20 +236,12 @@ void LLaMA::forward(std::unordered_map* output_ten // input_ids [batch_size, max_input_length] // input_lengths [batch_size] // output_seq_len [batch_size] on cpu - // start_id [batch_size] on cpu, optional - // end_id [batch_size] on cpu, optional - // stop_words_list [batch_size, 2, stop_words_length], optional - // bad_words_list [2, bad_words_length] or [batch_size, 2, bad_words_length], optional // min_length [1] or [batch_size] on cpu, optional, int // random_seed [1] or [batch_size] on cpu, optional, unsigned long long int. 
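The key cache wired into these output tensors (see `self_k_cache_shape`, set up shortly after this comment block) splits the head dimension as `Dh/x` by `x`, where `x = 16 / sizeof(T)`, so the innermost slice is one 16-byte vector. The sketch below shows how that packing factor works out for the `llama_33B` head geometry; the sequence length is arbitrary, and `uint16_t` stands in for FP16 since only `sizeof(T)` matters here.

```
#include <cstdint>
#include <cstdio>

// x = 16 / sizeof(T): elements per 16-byte chunk in the batch-major K cache,
// giving a per-layer shape of (batch, head_num, Dh/x, L, x).
template <typename T>
void print_k_cache_dims(std::size_t head_num, std::size_t size_per_head, std::size_t max_seq_len) {
    const std::size_t x = 16 / sizeof(T);
    std::printf("x=%zu  head_num=%zu  Dh/x=%zu  L=%zu\n",
                x, head_num, size_per_head / x, max_seq_len);
}

int main() {
    // llama_33B head geometry from llama_config.ini; max_seq_len chosen arbitrarily.
    print_k_cache_dims<std::uint16_t>(52, 128, 1024);  // FP16 stand-in: x = 8, Dh/x = 16
    print_k_cache_dims<float>(52, 128, 1024);          // FP32:          x = 4, Dh/x = 32
    return 0;
}
```

In other words, an FP16 cache packs 8 elements per innermost slice and an FP32 cache packs 4, which is what the `16 / sizeof(T)` terms in the cache shape encode.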
// output_tensors: - // output_ids [batch_size, beam_width, max_output_seq_len] - // sequence_length [batch_size, beam_width] - // output_log_probs [batch_size, beam_width, request_output_seq_len], must be float*. - // optional. It leads to additional computing cost. If we don't need this result, don't put it. - // cum_log_probs [batch_size, beam], optional, must be float*. - // optional. It leads to additional computing cost. If we don't need this result, don't put it. + // output_ids [batch_size, 1, max_output_seq_len] + // sequence_length [batch_size] // Step is from max_input_length ~ max_output_seq_len, // When step = k, we put output ids and caches at step k, and the sequence_length would be k - 1 before @@ -312,21 +256,19 @@ void LLaMA::forward(std::unordered_map* output_ten FT_CHECK(input_tensors->find("output_seq_len") != input_tensors->end() && input_tensors->at("output_seq_len").shape.size() == 1); FT_CHECK(output_tensors->at("output_ids").shape.size() == 3); - FT_CHECK(output_tensors->at("sequence_length").shape.size() == 2); + FT_CHECK(output_tensors->at("sequence_length").shape.size() == 1); FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape[0] == output_tensors->at("output_ids").shape[0], "input_tensors->at(\"input_ids\").shape[0] == output_tensors->at(\"output_ids\").shape[0]"); const size_t batch_size = output_tensors->at("output_ids").shape[0]; - const size_t beam_width = output_tensors->at("output_ids").shape[1]; // NOTE: Prefix Prompt PreProcessing - // get prefix_prompt_weight for each batch --> shape [batch, beam_width] + // get prefix_prompt_weight for each batch --> shape [batch, 1] // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] int max_input_length = input_tensors->at("input_ids").shape[1]; // Prefix Soft Prompt - const size_t limit_len_offset = (max_input_length == 0 ? 
1 : 0); - const size_t max_output_seq_len = input_tensors->at("output_seq_len").max() + limit_len_offset; + const size_t max_output_seq_len = input_tensors->at("output_seq_len").max(); const size_t max_seq_len = max_output_seq_len; // max cache seq len should include max prefix prompt length as it has k/v states const size_t max_cache_seq_len = max_output_seq_len; @@ -343,161 +285,122 @@ void LLaMA::forward(std::unordered_map* output_ten max_seq_len); } const cudaDataType_t gemm_data_type = getCudaDataType(); - allocateBuffer(batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length); - setSeqLimitLen(seq_limit_len_, input_tensors->at("output_seq_len"), limit_len_offset, batch_size); + allocateBuffer(batch_size, max_seq_len, max_cache_seq_len, max_input_length); sync_check_cuda_error(); const DataType data_type = getTensorType(); const std::vector self_k_cache_shape = {num_layer_ / pipeline_para_.world_size_, - batch_size * beam_width, + batch_size, head_num_, size_per_head_ / (16 / sizeof(T)), max_cache_seq_len, 16 / sizeof(T)}; const std::vector self_v_cache_shape = { - num_layer_ / pipeline_para_.world_size_, batch_size * beam_width, head_num_, max_cache_seq_len, size_per_head_}; + num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_cache_seq_len, size_per_head_}; // initialize the output ids and parent ids - cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); - cudaMemsetAsync(parent_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); - cudaMemsetAsync(masked_tokens_, false, sizeof(bool) * batch_size * beam_width * max_cache_seq_len, stream_); - cudaMemsetAsync(tiled_total_padding_count_, 0, sizeof(int) * batch_size * beam_width, stream_); - if (beam_width > 1) { - cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); - } - + cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * max_seq_len, stream_); sync_check_cuda_error(); // handle first step - if (max_input_length > 1) { - invokeTileGptInputs(tiled_input_ids_buf_, - tiled_input_lengths_buf_, - input_tensors->at("input_ids").getPtr(), - input_tensors->at("input_lengths").getPtr(), - batch_size, - beam_width, - max_input_length, - stream_); - sync_check_cuda_error(); - - invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, - output_ids_buf_, - llama_weights->pre_decoder_embedding_table, - llama_weights->position_encoding_table, - pPromptTuningParam{}, // no p/prompt tuning - tiled_input_ids_buf_, - 1, - max_input_length, - max_input_length, - batch_size * beam_width, - hidden_units_, - stream_); - sync_check_cuda_error(); - - invokeBuildDecoderAttentionMask(input_attention_mask_, - tiled_input_lengths_buf_, - nullptr, - batch_size * beam_width, - max_input_length, - 0, - stream_); - sync_check_cuda_error(); - - std::unordered_map decoder_input_tensors{ - {"decoder_input", - Tensor{MEMORY_GPU, - data_type, - {batch_size * beam_width, (size_t)max_input_length, hidden_units_}, - context_decoder_input_buf_}}, - {"attention_mask", - Tensor{MEMORY_GPU, - data_type, - {batch_size * beam_width, 1, (size_t)max_input_length, (size_t)(max_input_length)}, - input_attention_mask_}}, - {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_}}}; - - std::unordered_map decoder_output_tensors{ - {"decoder_output", - Tensor{MEMORY_GPU, - data_type, - {batch_size * beam_width, (size_t)max_input_length, hidden_units_}, - 
context_decoder_output_buf_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, - {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}, - {"last_token_hidden_units", - Tensor{MEMORY_GPU, data_type, {batch_size * beam_width, hidden_units_}, decoder_output_buf_}}}; - - llama_context_decoder_->forward( - &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); - sync_check_cuda_error(); - invokeDecodingInitialize(finished_buf_, - sequence_lengths_, - nullptr, - cum_log_probs_, - start_ids_buf_, - batch_size, - beam_width, - max_input_length - 1, - stream_); - sync_check_cuda_error(); - } - else if (max_input_length == 0) { - max_input_length++; - invokeDecodingInitialize(finished_buf_, - sequence_lengths_, - output_ids_buf_, - cum_log_probs_, - start_ids_buf_, - batch_size, - beam_width, - max_input_length - 1, - stream_); - std::vector h_input_lengths(batch_size * beam_width, 1); - cudaMemcpyAsync(tiled_input_lengths_buf_, - h_input_lengths.data(), - sizeof(int) * batch_size * beam_width, - cudaMemcpyHostToDevice, + invokeTileGptInputs(tiled_input_ids_buf_, + tiled_input_lengths_buf_, + input_tensors->at("input_ids").getPtr(), + input_tensors->at("input_lengths").getPtr(), + batch_size, + 1, + max_input_length, stream_); - sync_check_cuda_error(); - } - else if (max_input_length == 1) { - invokeDecodingInitialize(finished_buf_, - sequence_lengths_, - nullptr, - cum_log_probs_, - start_ids_buf_, - batch_size, - beam_width, - max_input_length - 1, - stream_); - sync_check_cuda_error(); - invokeTileGptInputs(tiled_input_ids_buf_, - tiled_input_lengths_buf_, - input_tensors->at("input_ids").getPtr(), - input_tensors->at("input_lengths").getPtr(), - batch_size, - beam_width, - max_input_length, - stream_); - sync_check_cuda_error(); + sync_check_cuda_error(); - cudaMemcpyAsync(output_ids_buf_, - tiled_input_ids_buf_, - sizeof(int) * batch_size * beam_width, - cudaMemcpyDeviceToDevice, - stream_); - } + invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, + output_ids_buf_, + llama_weights->pre_decoder_embedding_table, + llama_weights->position_encoding_table, + pPromptTuningParam{}, // no p/prompt tuning + tiled_input_ids_buf_, + 1, + max_input_length, + max_input_length, + batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); +// if (pipeline_para_.rank_ == 0) { +// T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * hidden_units_); +// cudaMemcpy(out, +// context_decoder_input_buf_, +// sizeof(T) * batch_size * max_input_length * hidden_units_, +// cudaMemcpyDeviceToHost); +// sync_check_cuda_error(); +// +// for (int b = 0; b < batch_size; ++b) { +// std::cout << "["; +// for (int s = 0; s < max_input_length; ++s) { +// std::cout << "["; +// for (int h = 0; h < 8; ++h) { +// std::cout << out[b * batch_size * hidden_units_ + s * hidden_units_ + h] << " "; +// } +// std::cout << "]\n"; +// } +// std::cout << "]\n"; +// } +// std::cout << "\n"; +// } + + invokeBuildDecoderAttentionMask( + input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, max_input_length, 0, stream_); + sync_check_cuda_error(); - invokeMaskPaddingTokens(masked_tokens_, - input_tensors->at("input_lengths").getPtr(), // not_tiled - nullptr, - max_cache_seq_len, - max_input_length, - 0, - batch_size, - beam_width, - stream_); + std::unordered_map decoder_input_tensors{ + {"decoder_input", + Tensor{ + MEMORY_GPU, data_type, {batch_size, (size_t)max_input_length, hidden_units_}, 
context_decoder_input_buf_}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {batch_size, 1, (size_t)max_input_length, (size_t)(max_input_length)}, + input_attention_mask_}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, tiled_input_lengths_buf_}}}; + + std::unordered_map decoder_output_tensors{ + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {batch_size, (size_t)max_input_length, hidden_units_}, + context_decoder_output_buf_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}, + {"last_token_hidden_units", Tensor{MEMORY_GPU, data_type, {batch_size, hidden_units_}, decoder_output_buf_}}}; + + llama_context_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); + sync_check_cuda_error(); + + // invokeGeneralLLaMALayerNorm( + // context_decoder_input_buf_, + // embedding_input_buf_, + // llama_weights->post_decoder_layernorm.gamma, + // llama_weights->post_decoder_layernorm.beta, + // layernorm_eps_, + // batch_size * max_input_length, + // hidden_units_, + // stream_); + // sync_check_cuda_error(); + // + // cublas_wrapper_->Gemm(CUBLAS_OP_N, + // CUBLAS_OP_N, + // batch_size * max_input_length, + // vocab_size_, + // hidden_units_, + // context_decoder_output_buf_, + // hidden_units_, // n + // llama_weights->post_decoder_embedding.kernel, + // vocab_size_, // k + // /* FIXME */, + // hidden_units_ /* n */); + // sync_check_cuda_error(); setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); @@ -551,43 +454,19 @@ void LLaMA::setOutputTensors(std::unordered_map* o } const size_t batch_size = output_tensors->at("output_ids").shape[0]; - const size_t beam_width = output_tensors->at("output_ids").shape[1]; uint* sequence_lengths = output_tensors->at("sequence_length").getPtr(); if (input_tensors->at("input_ids").shape[1] == 0) { invokeCudaD2DcpyConvert( sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); // TODO: D2D sequence_lenghts - if (beam_width > 1) { - // For beam search, do gather_tree - // take output_parent_ids as inter buffer - invokeGatherTree(transposed_output_ids_buf_, - sequence_lengths_, - max_output_seq_len, - batch_size, - beam_width, - output_ids_buf_ + batch_size * beam_width, - parent_ids_buf_ + batch_size * beam_width, - end_ids_buf_, - stream_); - - // transpose and take output_parent_ids as inter buffer - invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), - transposed_output_ids_buf_, - max_output_seq_len - 1, - batch_size * beam_width, - 1, - stream_); - } - else { - // For sampling, only copy the results to output_tensor - invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), - output_ids_buf_ + batch_size * beam_width, - max_output_seq_len - 1, - batch_size * beam_width, - 1, - stream_); - } + // For sampling, only copy the results to output_tensor + invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), + output_ids_buf_ + batch_size, + max_output_seq_len - 1, + batch_size, + 1, + stream_); } else { @@ -599,9 +478,9 @@ void LLaMA::setOutputTensors(std::unordered_map* o param.max_sequence_length_final_step = 1; param.max_time = max_output_seq_len; param.batch_size = batch_size; - param.beam_width = beam_width; + param.beam_width = 1; param.step_ids = output_ids_buf_; - param.parent_ids = 
beam_width == 1 ? nullptr : parent_ids_buf_; + param.parent_ids = nullptr; param.end_tokens = end_ids_buf_; param.max_input_length = max_input_length; param.prefix_soft_prompt_lengths = nullptr; @@ -615,21 +494,6 @@ void LLaMA::setOutputTensors(std::unordered_map* o sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); sync_check_cuda_error(); } - if ((output_tensors->count("output_log_probs") > 0 && output_tensors->at("output_log_probs").data != nullptr)) { - invokeTransposeAxis01(output_tensors->at("output_log_probs").getPtr(), - output_log_probs_buf_, - input_tensors->at("output_seq_len").max() - max_input_length, - batch_size * beam_width, - 1, - stream_); - } - // Return the cumulative log probability if requested. - if (output_tensors->count("cum_log_probs") > 0) { - Tensor cum_log_probs = output_tensors->at("cum_log_probs"); - FT_CHECK_WITH_INFO(cum_log_probs.size() == batch_size * beam_width, - "The shape of cum_log_probs does not match with batch_size x beam_width."); - cudaAutoCpy(cum_log_probs.getPtr(), cum_log_probs_, cum_log_probs.size(), stream_); - } } template @@ -644,12 +508,6 @@ size_t LLaMA::getPipelineParallelSize() return pipeline_para_.world_size_; } -template -bool* LLaMA::getFinishBuffer() -{ - return finished_buf_; -} - template class LLaMA; template class LLaMA; diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 303236b72..68b7cef4c 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -39,8 +39,6 @@ class LLaMA: public BaseLayer { static constexpr bool neox_rotary_style_ = true; static constexpr float layernorm_eps_ = 1e-6f; - int start_id_; - int end_id_; size_t hidden_units_; NcclParam tensor_para_; @@ -58,27 +56,18 @@ class LLaMA: public BaseLayer { void allocateBuffer() override; void allocateBuffer( - size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); + size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); void freeBuffer() override; void initialize(); protected: T* input_attention_mask_; - - T* decoder_input_buf_; T* decoder_output_buf_; - T* normed_decoder_output_buf_; float* logits_buf_; - float* nccl_logits_buf_; - float* cum_log_probs_; - bool* finished_buf_; - bool* h_finished_buf_; int* sequence_lengths_ = nullptr; - int* tiled_total_padding_count_ = nullptr; - uint32_t* seq_limit_len_ = nullptr; T* key_cache_; T* value_cache_; @@ -88,16 +77,11 @@ class LLaMA: public BaseLayer { int* tiled_input_lengths_buf_; int* transposed_output_ids_buf_; int* output_ids_buf_; - int* parent_ids_buf_; int* start_ids_buf_; int* end_ids_buf_; - bool* masked_tokens_ = nullptr; - - bool* generation_should_stop_ = nullptr; T* context_decoder_input_buf_; T* context_decoder_output_buf_; - float* output_log_probs_buf_; // function pointer callback using callback_sig = void(std::unordered_map*, void*); @@ -118,8 +102,6 @@ class LLaMA: public BaseLayer { size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, - int start_id, - int end_id, unsigned long long random_seed, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -136,8 +118,6 @@ class LLaMA: public BaseLayer { size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, - int start_id, - int end_id, unsigned long long random_seed, NcclParam tensor_para, NcclParam pipeline_para, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc 
b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 119c98041..c373c9d09 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -66,8 +66,6 @@ void LLaMAContextDecoder::allocateBuffer() template void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) { - decoder_normed_input_ = reinterpret_cast( - allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); self_attn_output_ = reinterpret_cast( allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); ffn_output_ = reinterpret_cast( @@ -85,7 +83,6 @@ template void LLaMAContextDecoder::freeBuffer() { if (is_allocate_buffer_ == true) { - allocator_->free((void**)(&decoder_normed_input_)); allocator_->free((void**)(&self_attn_output_)); allocator_->free((void**)(&ffn_output_)); allocator_->free((void**)(&decoder_layer_output_)); @@ -220,7 +217,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* // value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] // last_token_hidden_units [batch_size, hidden_dimension] - // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * local_batch_size'. + // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * batch_size'. // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. @@ -238,20 +235,15 @@ void LLaMAContextDecoder::forward(std::unordered_map* T* decoder_output = output_tensors->at("decoder_output").getPtr(); const T* attention_mask = input_tensors->at("attention_mask").getPtr(); - // const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); - const int local_batch_size = batch_size; - FT_CHECK(batch_size % local_batch_size == 0); - const int iteration_num = batch_size / local_batch_size; - Tensor& k_cache = output_tensors->at("key_cache"); Tensor& v_cache = output_tensors->at("value_cache"); std::vector self_k_cache_size; - self_k_cache_size.push_back(local_batch_size); + self_k_cache_size.push_back(batch_size); for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { self_k_cache_size.push_back(*t); } std::vector self_v_cache_size; - self_v_cache_size.push_back(local_batch_size); + self_v_cache_size.push_back(batch_size); for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { self_v_cache_size.push_back(*t); } @@ -259,158 +251,136 @@ void LLaMAContextDecoder::forward(std::unordered_map* AttentionType attention_type = attention_type_; const bool is_unpadded_mha = isUnPaddedMHA(attention_type); - for (int ite = 0; ite < iteration_num; ite++) { - size_t h_token_num = local_batch_size * seq_len; - if (is_unpadded_mha) { - const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); - invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, - &h_token_num, - padding_offset_, - cu_seqlens_, - base_input_lengths + ite * local_batch_size, - local_batch_size, - seq_len, - stream_); + size_t h_token_num = batch_size * seq_len; + if (is_unpadded_mha) { + const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, + &h_token_num, + padding_offset_, + cu_seqlens_, + base_input_lengths, + batch_size, + seq_len, + stream_); + } + + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l) == false) { 
+ continue; } - for (int l = 0; l < num_layer_; l++) { - if (isValidLayerParallelId(l) == false) { - continue; - } - if (l == 0 && is_unpadded_mha) { - invokeRemovePadding(decoder_layer_output_, - decoder_input + ite * local_batch_size * seq_len * hidden_units_, - padding_offset_, - h_token_num, - hidden_units_, - stream_); - } + if (l == 0 && is_unpadded_mha) { + invokeRemovePadding( + decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); + } - const bool is_final = false; // TODO(bhsueh) remove this flag - T* layer_input = decoder_layer_output_; - T* layer_output = decoder_layer_output_; - if (!is_unpadded_mha) { - if (l == 0) { - layer_input = decoder_input; - layer_input += ite * local_batch_size * seq_len * hidden_units_; - } - if (l == num_layer_ - 1) { - layer_output = decoder_output; - layer_output += ite * local_batch_size * seq_len * hidden_units_; - } + const bool is_final = false; // TODO(bhsueh) remove this flag + T* layer_input = decoder_layer_output_; + T* layer_output = decoder_layer_output_; + if (!is_unpadded_mha) { + if (l == 0) { + layer_input = decoder_input; } - - if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_; - std::cout << __FILE__ << ":" << __LINE__ << "\n"; - std::cout << "Recv: " << layer_output << "," << data_size << "\n"; - ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); + if (l == num_layer_ - 1) { + layer_output = decoder_output; } + } + + if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_; + ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); + } + + TensorMap self_attention_input_tensors{ + {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, + attention_mask}}, + {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, + {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, + {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, + {"pre_layernorm_weights_gamma", + Tensor{MEMORY_GPU, + data_type, + {(size_t)hidden_units_}, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma}}, + {"pre_layernorm_weights_beta", + Tensor{MEMORY_GPU, + data_type, + {(size_t)hidden_units_}, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta}}}; + + if (is_unpadded_mha) { + self_attention_input_tensors.insert("padding_offset", + Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset_}); + self_attention_input_tensors.insert("cu_seqlens", + Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens_}); + } + + size_t cache_offset = l - getFirstLayerParallelId(); + for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { + cache_offset *= *t; + }; + + TensorMap self_attention_output_tensors{ + {"hidden_features", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + + self_attention_layer_->forward(&self_attention_output_tensors, + 
&self_attention_input_tensors, + &llama_decoder_layer_weight->at(l)->self_attention_weights); + + if (is_final == false) { + invokeGeneralAddBiasResidualPreLayerNorm( + self_attn_output_, + layer_input, + self_attn_output_, + layer_input, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + h_token_num, + hidden_units_, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + (float*)nullptr, + 0, + stream_); + + TensorMap ffn_input_tensors( + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}}); + TensorMap ffn_output_tensors( + {{"ffn_output", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); + ffn_layer_->forward( + &ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); + + invokeAddBiasResidual(layer_output, + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + stream_); - invokeGeneralLLaMALayerNorm(decoder_normed_input_, - layer_input, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, - layernorm_eps_, - h_token_num, - hidden_units_, - stream_); sync_check_cuda_error(); - TensorMap self_attention_input_tensors{ - {"input_query", - Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, - {"attention_mask", - Tensor{MEMORY_GPU, - data_type, - {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, - attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, - {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, - {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, - {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; - - if (is_unpadded_mha) { - self_attention_input_tensors.insert("padding_offset", - Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset_}); - self_attention_input_tensors.insert( - "cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {size_t(local_batch_size + 1)}, cu_seqlens_}); + if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 + && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_; + ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); } - size_t cache_offset = l - getFirstLayerParallelId(); - for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { - cache_offset *= *t; - }; - size_t ite_cache_offset = ite * local_batch_size; - for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { - ite_cache_offset *= *t; - } - cache_offset += ite_cache_offset; - - TensorMap self_attention_output_tensors{ - {"hidden_features", - Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, - {"value_cache", - Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; - - self_attention_layer_->forward(&self_attention_output_tensors, - &self_attention_input_tensors, - &llama_decoder_layer_weight->at(l)->self_attention_weights); - - if (is_final == false) { 
- invokeGeneralAddBiasResidualPreLayerNorm( - self_attn_output_, - decoder_normed_input_, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, - layernorm_eps_, - h_token_num, - hidden_units_, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - 0, - stream_); - - TensorMap ffn_input_tensors( - {{"ffn_input", - Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); - TensorMap ffn_output_tensors( - {{"ffn_output", - Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); - ffn_layer_->forward( - &ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); - - invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - h_token_num, - hidden_units_, - stream_); - - sync_check_cuda_error(); - - if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 - && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_; - std::cout << __FILE__ << ":" << __LINE__ << "\n"; - std::cout << "Send: " << layer_output << "," << data_size << "\n"; - ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); - std::cout << __FILE__ << ":" << __LINE__ << "\n"; - } - - if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, - decoder_layer_output_, - padding_offset_, - h_token_num, - head_num_ * size_per_head_, - stream_); - } + if ((l == num_layer_ - 1) && is_unpadded_mha) { + invokeRebuildPadding(decoder_output, + decoder_layer_output_, + padding_offset_, + h_token_num, + head_num_ * size_per_head_, + stream_); } } } diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc index f7081de11..f1c51e340 100644 --- a/src/fastertransformer/models/llama/LLaMAWeight.cc +++ b/src/fastertransformer/models/llama/LLaMAWeight.cc @@ -124,7 +124,6 @@ void LLaMAWeight::setWeightPtr() { pre_decoder_embedding_table = weights_ptr[0]; post_decoder_layernorm.beta = weights_ptr[1]; - post_decoder_layernorm.beta = nullptr; post_decoder_layernorm.gamma = weights_ptr[2]; post_decoder_embedding.kernel = weights_ptr[3]; } From 81dc94ad9c71038b94a8ccf4eaf133ac8f6e8631 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 16 Sep 2023 15:05:17 +0000 Subject: [PATCH 12/55] dump --- examples/cpp/llama/llama_config.ini | 1 + examples/cpp/llama/llama_example.cc | 7 ++- .../kernels/unfused_attention_kernels.cu | 3 ++ .../LLaMAContextAttentionLayer.cc | 38 ++++++++++++++-- src/fastertransformer/models/llama/LLaMA.cc | 45 ++++++++++--------- src/fastertransformer/models/llama/LLaMA.h | 2 +- 6 files changed, 68 insertions(+), 28 deletions(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 9cb766533..1e92695e5 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -16,4 +16,5 @@ vocab_size=32000 decoder_layers=60 rotary_embedding=128 multiple_of=256 +max_cache_seq_len=1024 padding_id=0 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 2955cbb14..ebdf7cb9e 100644 --- 
a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -81,13 +81,14 @@ void llama_example(const INIReader reader) const size_t decoder_layers = reader.GetInteger(model_name, "decoder_layers"); const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); const int multiple_of = reader.GetInteger(model_name, "multiple_of"); + const size_t max_cache_seq_len = reader.GetInteger(model_name, "max_cache_seq_len"); const size_t hidden_units = head_num * size_per_head; const size_t inter_size = multiple_of * ((2 * hidden_units + multiple_of - 1) / multiple_of); const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); const int min_length = reader.GetInteger("request", "min_length", 0); - const int padding_id = reader.GetInteger(model_name, "padding_id"); + const int padding_id = reader.GetInteger(model_name, "padding_id"); FT_CHECK(decoder_layers % pipeline_para_size == 0); @@ -224,7 +225,9 @@ void llama_example(const INIReader reader) {"output_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}, {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{1}, &min_length}}, - {"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}}; + {"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}, + {"max_cache_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &max_cache_seq_len}} + }; std::unordered_map output_tensors = std::unordered_map{ {"output_ids", diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index d0fb0a197..b2f7d7809 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1407,6 +1407,9 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int src_k_idx = token_idx * 3 * n + hidden_idx + n; const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; + if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0) + printf("is_masked: %d, do_rotary: %d\n", is_masked, do_rotary); + Vec_t q, k, v; Vec_t q_bias, k_bias, v_bias; if (!is_masked) { diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 8837acb82..2297b9999 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -160,11 +160,43 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten 0, // int8_mode stream_); sync_check_cuda_error(); + if (layer_id == 0) { + // shape: [B, H, L, Dh] + T* q_buf = (T*)malloc(sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_); + T* k_buf = q_buf + request_batch_size * request_seq_len * hidden_units_; + T* v_buf = k_buf + request_batch_size * request_seq_len * hidden_units_; + cudaMemcpy(q_buf, + q_buf_2_, + sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_, + cudaMemcpyDeviceToHost); + sync_check_cuda_error(); + + for (int b = 0; b < request_batch_size; ++b) { + std::cout << "["; + for (int h = 0; h < head_num_; ++h) { + std::cout << "["; + for (int s = 0; s < request_seq_len; ++s) { + std::cout << "["; + for (int e = 0; e < 8; ++e) { + std::cout << k_buf[b * head_num_ * request_seq_len * size_per_head_ + + h * request_seq_len * size_per_head_ + + s * size_per_head_ + + e] 
+ << " "; + } + std::cout << "]\n"; + } + std::cout << "]\n"; + } + std::cout << "]\n"; + } + std::cout << "\n"; + } const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length // Use batch major - // put k/v_buf from shape [B, H, PL + L, Dh] - // to cache [B, H, Dh/x, PL + L, x] and [B, H, PL + L, Dh/x, x], PL denotes prompt length + // put k/v_buf from shape [B, H, L, Dh] + // to cache [B, H, Dh/x, L, x] and [B, H, L, Dh/x, x] invokeTranspose4dBatchMajor(output_tensors->getPtr("key_cache"), output_tensors->getPtr("value_cache"), k_buf_2_, @@ -175,7 +207,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten size_per_head_, head_num_, stream_); - // IDEA : after this, + // IDEA : after this, // k_cache = (batch_size, num_heads, Dh/x, L, x) // v_cache = (batch_size, num_heads, L, Dh) sync_check_cuda_error(); diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 3b52fe2e1..c74dd4663 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -238,6 +238,7 @@ void LLaMA::forward(std::unordered_map* output_ten // output_seq_len [batch_size] on cpu // min_length [1] or [batch_size] on cpu, optional, int // random_seed [1] or [batch_size] on cpu, optional, unsigned long long int. + // max_cache_seq_len [batch_size] on cpu // output_tensors: // output_ids [batch_size, 1, max_output_seq_len] @@ -271,7 +272,7 @@ void LLaMA::forward(std::unordered_map* output_ten const size_t max_output_seq_len = input_tensors->at("output_seq_len").max(); const size_t max_seq_len = max_output_seq_len; // max cache seq len should include max prefix prompt length as it has k/v states - const size_t max_cache_seq_len = max_output_seq_len; + const size_t max_cache_seq_len = input_tensors->at("max_cache_seq_len").max(); if (max_cache_seq_len < max_seq_len) { FT_LOG_WARNING("max_cache_seq_len (%d) is less than max_seq_len (%d). 
" "Note that this reduces the memory cost of k/v cache, but may hurt the accuracy.", @@ -327,27 +328,27 @@ void LLaMA::forward(std::unordered_map* output_ten hidden_units_, stream_); sync_check_cuda_error(); -// if (pipeline_para_.rank_ == 0) { -// T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * hidden_units_); -// cudaMemcpy(out, -// context_decoder_input_buf_, -// sizeof(T) * batch_size * max_input_length * hidden_units_, -// cudaMemcpyDeviceToHost); -// sync_check_cuda_error(); -// -// for (int b = 0; b < batch_size; ++b) { -// std::cout << "["; -// for (int s = 0; s < max_input_length; ++s) { -// std::cout << "["; -// for (int h = 0; h < 8; ++h) { -// std::cout << out[b * batch_size * hidden_units_ + s * hidden_units_ + h] << " "; -// } -// std::cout << "]\n"; -// } -// std::cout << "]\n"; -// } -// std::cout << "\n"; -// } + // if (pipeline_para_.rank_ == 0) { + // T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * hidden_units_); + // cudaMemcpy(out, + // context_decoder_input_buf_, + // sizeof(T) * batch_size * max_input_length * hidden_units_, + // cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < batch_size; ++b) { + // std::cout << "["; + // for (int s = 0; s < max_input_length; ++s) { + // std::cout << "["; + // for (int h = 0; h < 8; ++h) { + // std::cout << out[b * batch_size * hidden_units_ + s * hidden_units_ + h] << " "; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } invokeBuildDecoderAttentionMask( input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, max_input_length, 0, stream_); diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 68b7cef4c..386a09cd4 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -36,7 +36,7 @@ class LLaMA: public BaseLayer { size_t vocab_size_; size_t rotary_embedding_dim_; - static constexpr bool neox_rotary_style_ = true; + static constexpr bool neox_rotary_style_ = false; static constexpr float layernorm_eps_ = 1e-6f; size_t hidden_units_; From d5b2c12b846bb8c4cc4336cd4656d3233af798d3 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 16 Sep 2023 15:07:40 +0000 Subject: [PATCH 13/55] for junsik --- src/fastertransformer/kernels/unfused_attention_kernels.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index b2f7d7809..8d4c5e6da 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1407,8 +1407,6 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int src_k_idx = token_idx * 3 * n + hidden_idx + n; const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; - if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0) - printf("is_masked: %d, do_rotary: %d\n", is_masked, do_rotary); Vec_t q, k, v; Vec_t q_bias, k_bias, v_bias; From 8ec39b5f19ef9abe82be25cc5afe9629e6637b3e Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 16 Sep 2023 20:25:38 +0000 Subject: [PATCH 14/55] first success --- examples/cpp/llama/llama_example.cc | 2 +- .../kernels/layernorm_kernels.cu | 663 +++++++++++++++++- .../kernels/layernorm_kernels.h | 14 + .../kernels/unfused_attention_kernels.cu | 3 +- .../LLaMAContextAttentionLayer.cc | 435 ++++++------ .../LLaMAContextAttentionLayer.h | 2 - 
src/fastertransformer/models/llama/LLaMA.cc | 70 +- src/fastertransformer/models/llama/LLaMA.h | 1 + .../models/llama/LLaMAContextDecoder.cc | 235 +++++-- .../models/llama/LLaMADecoderLayerWeight.cc | 12 +- 10 files changed, 1109 insertions(+), 328 deletions(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index ebdf7cb9e..43f55c4b7 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -84,7 +84,7 @@ void llama_example(const INIReader reader) const size_t max_cache_seq_len = reader.GetInteger(model_name, "max_cache_seq_len"); const size_t hidden_units = head_num * size_per_head; - const size_t inter_size = multiple_of * ((2 * hidden_units + multiple_of - 1) / multiple_of); + const size_t inter_size = multiple_of * (((8 * hidden_units / 3) + multiple_of - 1) / multiple_of); const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); const int min_length = reader.GetInteger("request", "min_length", 0); diff --git a/src/fastertransformer/kernels/layernorm_kernels.cu b/src/fastertransformer/kernels/layernorm_kernels.cu index b19e9ac73..6244dbfd6 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.cu +++ b/src/fastertransformer/kernels/layernorm_kernels.cu @@ -19,6 +19,237 @@ #include "src/fastertransformer/utils/cuda_type_utils.cuh" namespace fastertransformer { +// __global__ void generalLLaMAAddBiasResidualLayerNormOpt(T* normed_output, +// __global__ void generalLLaMAAddBiasResidualLayerNormOpt2(T* normed_output, +// __global__ void generalLLaMAAddBiasResidualLayerNorm(const T* __restrict input, + +template +__global__ void generalLLaMAAddBiasResidualLayerNormOpt(T* normed_output, + T* output, + const T* __restrict input, + const T* __restrict bias, + const T* __restrict residual1, + const T* __restrict residual2, + const T* __restrict gamma, + const T* __restrict beta, + const float layernorm_eps, + int m, + int n) +{ + extern __shared__ __align__(sizeof(float)) char _shmem[]; // Align on largest type + T* shmem = reinterpret_cast(_shmem); + + __shared__ float s_variance; + float variance = 0.0f; + + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + T local_sum = cuda_cast(0.0f); + + const Float_Packed_T scale_from_int = cuda_cast(0.0f); + const Float_Packed_T scale_to_int = cuda_cast(0.0f); + +#pragma unroll + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + T val = cuda_cast(0.0f); + + if (IS_BIAS) { + val = hadd2(val, ldg(&bias[i])); + } + if (RESIDUAL_NUM == 1) { + val = hadd2(val, ldg(&residual1[index])); + } + else if (RESIDUAL_NUM == 2) { + val = hadd2(hadd2(val, ldg(&residual1[index])), ldg(&residual2[index])); + } + + if (IS_OUTPUT) { + T in_val; + in_val = input[index]; + val = hadd2(val, in_val); + } + shmem[i] = val; + output[index] = val; + local_sum = hadd2(local_sum, val); + } + + float local_var_sum = 0.0f; +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + T val = input[blockIdx.x * n + i]; + float diff_1 = (float)(val.x); + float diff_2 = (float)(val.y); + local_var_sum += (diff_1 * diff_1 + diff_2 * diff_2); + } + variance = blockReduceSum(local_var_sum); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / n / 2 + layernorm_eps); + } + __syncthreads(); + + T var_2 = cuda_cast(s_variance); + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + 
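        // Write-back step of the fused kernel: the value cached in shmem is scaled by var_2
        // (the broadcast s_variance = rsqrtf(variance / n / 2 + layernorm_eps) computed above)
        // and by gamma[i], with beta added only when IS_BETA is set -- an RMS-style
        // normalization, roughly y = x * rsqrt(mean(x^2) + eps) * gamma (+ beta).
        // The divisor is n * 2 because each packed T (half2 / bfloat162) holds two scalars.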
T val = hmul2(shmem[i], var_2, ldg(&gamma[i])); + if (IS_BETA) { + val = hadd2(val, ldg(&beta[i])); + } + + normed_output[index] = val; + } +} + +// * Note that typename T is half2 or bfloat2 type +template +__global__ void generalLLaMAAddBiasResidualLayerNormOpt2(T* normed_output, + T* output, + const T* __restrict input, + const T* __restrict bias, + const T* __restrict residual1, + const T* __restrict residual2, + const T* __restrict gamma, + const T* __restrict beta, + const float layernorm_eps, + int m, + int n) +{ + extern __shared__ __align__(sizeof(float)) char _shmem[]; + T* shmem = reinterpret_cast(_shmem); + + __shared__ float s_variance; + float x2_sum = 0.0f; + const int b_offset = blockIdx.x * n; + + using T1 = typename TypeConverter::Type; + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + const Float_Packed_T scale_vec_in = cuda_cast(0.0f); + const Float_Packed_T scale_vec = cuda_cast(0.0f); + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = b_offset + i; + float val_1 = 0.0f; + float val_2 = 0.0f; + T tmp; + + if (IS_BIAS) { + tmp = ldg(&bias[i]); + val_1 += static_cast(tmp.x); + val_2 += static_cast(tmp.y); + } + if (RESIDUAL_NUM == 1) { + tmp = ldg(&residual1[index]); + val_1 += static_cast(tmp.x); + val_2 += static_cast(tmp.y); + } + else if (RESIDUAL_NUM == 2) { + tmp = ldg(&residual1[index]); + T tmp2 = ldg(&residual2[index]); + val_1 += (static_cast(tmp.x) + static_cast(tmp2.x)); + val_2 += (static_cast(tmp.y) + static_cast(tmp2.y)); + } + + if (IS_OUTPUT) { + tmp = ldg(&input[index]); + val_1 += static_cast(tmp.x); + val_2 += static_cast(tmp.y); + } + tmp.x = cuda_cast(val_1); + tmp.y = cuda_cast(val_2); + shmem[i] = tmp; + output[index] = tmp; + x2_sum += val_1 * val_1 + val_2 * val_2; + } + float sum_sq = blockReduceSum(x2_sum); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(sum_sq / n / 2 + layernorm_eps); + } + __syncthreads(); + + T var_2 = cuda_cast(s_variance); + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + T val = hmul2(shmem[i], var_2, ldg(&gamma[i])); + if (IS_BETA) { + val = hadd2(val, ldg(&beta[i])); + } + + normed_output[index] = val; + } +} + +template +__global__ void generalLLaMAAddBiasResidualLayerNorm(const T* __restrict input, + const T* __restrict residual1, + const T* __restrict residual2, + const T* __restrict gamma, + const T* __restrict beta, + const T* __restrict bias, + T* output, + T* norm_output, + const float layernorm_eps, + int m, + int n) +{ + int tid = threadIdx.x; + + // NOTE: float shmem may exceed the shared memory limit + extern __shared__ __align__(sizeof(float)) char _shmem[]; + T* shmem = reinterpret_cast(_shmem); + + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + __shared__ float s_variance; + float variance = 0.0f; + float local_sum = 0.0f; + for (int i = tid; i < n; i += blockDim.x) { + float local_out = 0.0f; + if (RESIDUAL_NUM == 1) { + local_out = (float)(ldg(&residual1[blockIdx.x * n + i])); + } + else if (RESIDUAL_NUM == 2) { + local_out = (float)(ldg(&residual1[blockIdx.x * n + i])) + float(ldg(&residual2[blockIdx.x * n + i])); + } + local_out += (float)(input[blockIdx.x * n + i]); + + if (bias != nullptr) { + local_out += (float)(ldg(&bias[i])); + } + shmem[i] = (T)local_out; + output[blockIdx.x * n + i] = (T)local_out; + local_sum += local_out; + } + + float 
local_var_sum = 0.0f; + for (int i = tid; i < n; i += blockDim.x) { + float diff = (float)(output[blockIdx.x * n + i]); + local_var_sum += diff * diff; + } + variance = blockReduceSum(local_var_sum); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / n + layernorm_eps); + } + __syncthreads(); + + for (int i = tid; i < n; i += blockDim.x) { + float beta_val = (beta == nullptr) ? 0.0f : (float)(ldg(&beta[i])); + const float val = (((float)shmem[i] * s_variance) * (float)(ldg(&gamma[i])) + beta_val); + + norm_output[blockIdx.x * n + i] = (T)val; + } +} // * Note that typename T is half2 or bfloat2 type template @@ -841,6 +1072,51 @@ __global__ void generalAddBiasResidualLayerNorm(const T* __restrict input, } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_opt_version(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version) +{ + size_t maxbytes = half_n * sizeof(T); + if (opt_version == 1) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNormOpt, + cudaFuncAttributeMaxDynamicSharedMemorySize, + maxbytes)); + } + generalLLaMAAddBiasResidualLayerNormOpt + <<>>( + norm_output, output, input, bias, residual1, residual2, gamma, beta, layernorm_eps, m, half_n); + } + else if (opt_version == 2) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNormOpt2, + cudaFuncAttributeMaxDynamicSharedMemorySize, + maxbytes)); + } + generalLLaMAAddBiasResidualLayerNormOpt2 + <<>>( + norm_output, output, input, bias, residual1, residual2, gamma, beta, layernorm_eps, m, half_n); + } + else { + FT_CHECK_WITH_INFO(false, "opt_num must be 1 or 2"); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_opt_version(T* norm_output, T* output, @@ -919,6 +1195,62 @@ void dispatch_generalAddBiasResidualLayerNormOpt_opt_version(T* norm_o } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_is_output(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output) +{ + if (is_output) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_opt_version( + norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version); + } + else { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_opt_version( + norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_is_output(T* norm_output, T* output, @@ -990,6 +1322,62 @@ void dispatch_generalAddBiasResidualLayerNormOpt_is_output(T* norm_out } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_bias(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output) +{ + if (bias != nullptr) { + 
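        // The dispatch_* helpers in this chain only translate runtime flags (bias != nullptr,
        // is_output, residual_num, unroll_factor, opt_version) into compile-time template
        // arguments, so the fused add-bias-residual + normalization kernel is fully specialized
        // at launch instead of branching per element (see the
        // "unroll_factor -> residual_num -> is_bias -> opt_version" comment further below).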
dispatch_generalLLaMAAddBiasResidualLayerNormOpt_is_output(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } + else { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_is_output(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_bias(T* norm_output, T* output, @@ -1061,6 +1449,66 @@ void dispatch_generalAddBiasResidualLayerNormOpt_bias(T* norm_output, } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output, + int residual_num) +{ + if (residual_num == 1) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_bias(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } + else if (residual_num == 2) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_bias(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } + else { + FT_CHECK_WITH_INFO(false, "residual_num must be 1 or 2"); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_residual_num(T* norm_output, T* output, @@ -1136,6 +1584,108 @@ void dispatch_generalAddBiasResidualLayerNormOpt_residual_num(T* norm_ } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_unroll_factor(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output, + int residual_num, + int unroll_factor) +{ + switch (unroll_factor) { + case 1: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + case 2: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + case 4: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + case 8: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + default: + FT_CHECK_WITH_INFO(false, "unroll_factor must be 1, 2, 4 or 8"); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_unroll_factor(T* norm_output, T* output, @@ -1263,6 
+1813,105 @@ void dispatch_generalAddBiasResidualLayerNormOpt_unroll_factor(T* norm } } +template +void invokeGeneralLLaMAAddBiasResidualPreLayerNorm(T* output, + T* norm_output, + const T* input, + const T* residual1, + const T* gamma, + const T* beta, + const T* bias, + const float layernorm_eps, + int m, + int n, + cudaStream_t stream, + int opt_version) +{ + const int residual_num = 1; + if (opt_version > 0 && sizeof(T) == 2 && n % 2 == 0) { + dim3 grid(m); + int half_n = n / 2; + int half_n_32 = (half_n + 31) / 32 * 32; + dim3 block(min(half_n_32, 512)); + int rolls_per_thread = half_n / block.x; + int unroll_factor = 8; + while (unroll_factor > rolls_per_thread && unroll_factor > 1) { + unroll_factor /= 2; + } + + using T2 = typename TypeConverter::Type; + + /* we launch (and instantiate) the kernel by specializing for unroll_factor -> residual_num -> is_bias -> + * opt_version */ + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_unroll_factor((T2*)norm_output, + (T2*)output, + (const T2*)input, + (const T2*)bias, + (const T2*)residual1, + (const T2*)nullptr, + (const T2*)gamma, + (const T2*)beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + true, // is_output + residual_num, + unroll_factor); + } + else { + + dim3 grid(m); + dim3 block(min(n, 1024)); + + /* For general cases, n is equal to hidden_units, e.g., 512/1024. + Since we have warp shuffle inside the code, block.x % 32 should be 0. + */ + block.x = (block.x + 31) / 32 * 32; + + size_t maxbytes = n * sizeof(T); + if (residual_num == 1) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNorm, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes)); + } + generalLLaMAAddBiasResidualLayerNorm<<>>( + input, residual1, nullptr, gamma, beta, bias, output, norm_output, layernorm_eps, m, n); + } + else if (residual_num == 2) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNorm, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes)); + } + generalLLaMAAddBiasResidualLayerNorm<<>>( + input, residual1, nullptr, gamma, beta, bias, output, norm_output, layernorm_eps, m, n); + } + } +} + +#define INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(T) \ + template void invokeGeneralLLaMAAddBiasResidualPreLayerNorm(T* output, \ + T* norm_output, \ + const T* input, \ + const T* residual1, \ + const T* gamma, \ + const T* beta, \ + const T* bias, \ + const float layernorm_eps, \ + int m, \ + int n, \ + cudaStream_t stream, \ + int opt_version) +INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(float); +INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(half); +#ifdef ENABLE_BF16 +INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(__nv_bfloat16); +#endif +#undef INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM + /* output <- output + bias + residual_1 + residual_2 * output_norm <- LN(output) */ template @@ -1875,29 +2524,29 @@ __global__ void generalLLaMALayerNorm(const T* __restrict input, extern __shared__ __align__(sizeof(float)) char _shmem[]; T* shmem = reinterpret_cast(_shmem); - __shared__ float s_mean_sq; - float mean_sq = 0.0f; + __shared__ float s_variance; + float variance = 0.0f; using Float_Packed_T = typename packed_as::value>::type; using Scalar_T = typename packed_as::type; - float local_sum = 0.0f; + float local_var_sum = 0.0f; for (int i = tid; i < n; i += blockDim.x) { float val = 
(float)(ldg(&input[blockIdx.x * n + i])); - local_sum += val * val; + local_var_sum += val * val; } - mean_sq = blockReduceSum(local_sum); + variance = blockReduceSum(local_var_sum); if (threadIdx.x == 0) { - s_mean_sq = rsqrtf(mean_sq / (float)n + layernorm_eps); + s_variance = rsqrtf(variance / (float)n + layernorm_eps); } __syncthreads(); for (int i = tid; i < n; i += blockDim.x) { const int index = blockIdx.x * n + i; float beta_val = (beta == nullptr) ? 0.0f : (float)ldg(&beta[i]); - T val = (T)(((float)input[index] * s_mean_sq) * (float)(ldg(&gamma[i])) + beta_val); + T val = (T)(((float)input[index] * s_variance) * (float)(ldg(&gamma[i])) + beta_val); normed_output[index] = val; } diff --git a/src/fastertransformer/kernels/layernorm_kernels.h b/src/fastertransformer/kernels/layernorm_kernels.h index 5c5c03c7a..8fb8ecf8b 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.h +++ b/src/fastertransformer/kernels/layernorm_kernels.h @@ -62,6 +62,20 @@ void invokeAddBiasResidualLayerNorm(T* out, const int n, cudaStream_t stream); +template +void invokeGeneralLLaMAAddBiasResidualPreLayerNorm(T* output, + T* norm_output, + const T* input, + const T* residual1, + const T* gamma, + const T* beta, + const T* bias, + const float layernorm_eps, + int m, + int n, + cudaStream_t stream, + int opt_version = 2); + template void invokeGeneralAddBiasResidualPreLayerNorm(T* output, T* norm_output, diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 8d4c5e6da..61d2a54ff 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1364,8 +1364,8 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int tidx = threadIdx.x; const int total_seq_len = param.max_prefix_prompt_length + seq_len; - const bool is_masked = tidx * vec_size >= size_per_head; + // NOTE: blockIdx.x < batch_size * param.max_prefix_prompt_length really handles prefix prompts if (PREFIX_PROMPT && token_idx < 0) { const int prompt_batch_idx = blockIdx.x / param.max_prefix_prompt_length; @@ -1407,7 +1407,6 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int src_k_idx = token_idx * 3 * n + hidden_idx + n; const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; - Vec_t q, k, v; Vec_t q_bias, k_bias, v_bias; if (!is_masked) { diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 2297b9999..70e638150 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -31,13 +31,10 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // input_query [token_num, hidden_dimension] // attention_mask [batch_size, 1, seq_len, seq_len] // attention_type [1] - // is_final_layer [1], bool on cpu // layer_id [1], int on cpu // padding_offset, int, [token_num] (optional) // cu_seqlens, int, [batch_size] (optional) // each element contains ptr with buffer shape[2, head_num_, prompt_length, size_per_head] - // pre_layernorm_weights_gamma [hidden_dimension] - // pre_layernorm_weights_beta [hidden_dimension] // output_tensors: // hidden_features [token_num, hidden_dimension] @@ -46,13 +43,11 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); 
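    // The two shape checks below encode the expected cache layouts (cf. the comment near
    // invokeTranspose4dBatchMajor further down): key_cache is 5-D,
    // [batch, head_num, size_per_head / x, max_seq_len, x], and value_cache is 4-D,
    // [batch, head_num, max_seq_len, size_per_head], where x is presumably the 16-byte
    // packing width (e.g. x = 8 for FP16).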
FT_CHECK(output_tensors->at("key_cache").shape.size() == 5); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); - const int request_batch_size = input_tensors->at("attention_mask").shape[0]; - const int request_seq_len = input_tensors->at("attention_mask").shape[2]; - const int layer_id = input_tensors->getVal("layer_id"); - const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); - int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); - const T* pre_layernorm_weights_gamma = input_tensors->getPtr("pre_layernorm_weights_gamma"); - const T* pre_layernorm_weights_beta = input_tensors->getPtr("pre_layernorm_weights_beta"); + const int request_batch_size = input_tensors->at("attention_mask").shape[0]; + const int request_seq_len = input_tensors->at("attention_mask").shape[2]; + const int layer_id = input_tensors->getVal("layer_id"); + const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); + int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); T* attention_out = output_tensors->at("hidden_features").getPtr(); T* attention_input = input_tensors->at("input_query").getPtr(); @@ -67,40 +62,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten POP_RANGE; sync_check_cuda_error(); - const bool is_final = input_tensors->at("is_final_layer").getVal(); const int m = input_tensors->at("input_query").shape[0]; - PUSH_RANGE("attention buffer alloc"); - invokeGeneralLLaMALayerNorm(decoder_normed_input_, - attention_input, - pre_layernorm_weights_gamma, - pre_layernorm_weights_beta, - layernorm_eps_, - m, - hidden_units_, - stream_); - sync_check_cuda_error(); - POP_RANGE; - // if (l == 0) { - // T* out = (T*)malloc(sizeof(T) * h_token_num * hidden_units_); - // cudaMemcpy(out, decoder_normed_input_, sizeof(T) * h_token_num * hidden_units_, cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < h_token_num; ++b) { - // std::cout << "["; - // int i = 0; - // for (int h = 0; h < hidden_units_; ++h) { - // std::cout << out[b * hidden_units_ + h] << " "; - // ++i; - // if (i == 8) - // break; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } - sync_check_cuda_error(); - PUSH_RANGE("qkv_gemm"); cublas_wrapper_->Gemm(CUBLAS_OP_N, @@ -110,7 +73,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten hidden_units_, // k attention_weights->query_weight.kernel, 3 * hidden_units_, // n - decoder_normed_input_, + attention_input, hidden_units_, // k qkv_buf_, 3 * hidden_units_ /* n */); @@ -160,38 +123,38 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten 0, // int8_mode stream_); sync_check_cuda_error(); - if (layer_id == 0) { - // shape: [B, H, L, Dh] - T* q_buf = (T*)malloc(sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_); - T* k_buf = q_buf + request_batch_size * request_seq_len * hidden_units_; - T* v_buf = k_buf + request_batch_size * request_seq_len * hidden_units_; - cudaMemcpy(q_buf, - q_buf_2_, - sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_, - cudaMemcpyDeviceToHost); - sync_check_cuda_error(); - - for (int b = 0; b < request_batch_size; ++b) { - std::cout << "["; - for (int h = 0; h < head_num_; ++h) { - std::cout << "["; - for (int s = 0; s < request_seq_len; ++s) { - std::cout << "["; - for (int e = 0; e < 8; ++e) { - std::cout << k_buf[b * head_num_ * request_seq_len * size_per_head_ - + h * request_seq_len * size_per_head_ - + s * size_per_head_ - + e] - << " "; - } - std::cout << "]\n"; - } - 
std::cout << "]\n"; - } - std::cout << "]\n"; - } - std::cout << "\n"; - } + // if (layer_id == 0) { + // // shape: [B, H, L, Dh] + // T* q_buf = (T*)malloc(sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_); + // T* k_buf = q_buf + request_batch_size * request_seq_len * hidden_units_; + // T* v_buf = k_buf + request_batch_size * request_seq_len * hidden_units_; + // cudaMemcpy(q_buf, + // q_buf_2_, + // sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_, + // cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < request_batch_size; ++b) { + // std::cout << "["; + // for (int h = 0; h < head_num_; ++h) { + // std::cout << "["; + // for (int s = 0; s < request_seq_len; ++s) { + // std::cout << "["; + // for (int e = 0; e < 8; ++e) { + // std::cout << v_buf[b * head_num_ * request_seq_len * size_per_head_ + // + h * request_seq_len * size_per_head_ + // + s * size_per_head_ + // + e] + // << " "; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length // Use batch major @@ -212,160 +175,211 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // v_cache = (batch_size, num_heads, L, Dh) sync_check_cuda_error(); - // TODO: fmha kernels doesn't support different seq lengths of q and kv + // NOTE: qkv buffer shape (batch_size, num_heads,L or prompt_len + L, Dh) + + POP_RANGE; + if (attention_type == AttentionType::FUSED_MHA) { dispatcher_fp16->setup_causal_masked_fmha(request_seq_len, request_batch_size); dispatcher_fp16->run_causal_masked_fmha(qkv_buf_, cu_seqlens, qkv_buf_3_, true, stream_); } - // NOTE: qkv buffer shape (batch_size, num_heads,L or prompt_len + L, Dh) - - POP_RANGE; - if (is_final == false) { + else { const cudaDataType_t gemm_data_type = getCudaDataType(); const int attention_seq_len_1 = request_seq_len; // q length const int attention_seq_len_2 = request_seq_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); - if (attention_type != AttentionType::FUSED_MHA) { - if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { - PUSH_RANGE("Q*K batch gemm"); - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, - CUBLAS_OP_N, - attention_seq_len_2, // n - attention_seq_len_1, // m - size_per_head_, // k - 1.0f, - k_buf_2_, - gemm_data_type, - size_per_head_, // k - attention_seq_len_2 * size_per_head_, // n * k - q_buf_2_, - gemm_data_type, - size_per_head_, // k - attention_seq_len_1 * size_per_head_, // m * k - 0.0f, - qk_buf_float_, - CUDA_R_32F, - attention_seq_len_2, // n - attention_seq_len_2 * attention_seq_len_1, - request_batch_size * head_num_, // global batch size - CUDA_R_32F); - - sync_check_cuda_error(); - POP_RANGE; - - PUSH_RANGE("softmax"); - MaskedSoftmaxParam param; - param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) - param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = request_batch_size; - param.q_length = attention_seq_len_1; - param.k_length = attention_seq_len_2; - param.num_heads = head_num_; - param.qk_scale = qk_scale; - param.linear_bias_slopes = nullptr; - invokeMaskedSoftmax(param, stream_); - sync_check_cuda_error(); - POP_RANGE; - } - else { - PUSH_RANGE("Q*K batch gemm"); - 
cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, - CUBLAS_OP_N, - attention_seq_len_2, - attention_seq_len_1, - size_per_head_, - k_buf_2_, - size_per_head_, - attention_seq_len_2 * size_per_head_, - q_buf_2_, - size_per_head_, - attention_seq_len_1 * size_per_head_, - qk_buf_, - attention_seq_len_2, - attention_seq_len_2 * attention_seq_len_1, - request_batch_size * head_num_); - - POP_RANGE; - PUSH_RANGE("softmax"); - MaskedSoftmaxParam param; - param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.qk = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = request_batch_size; - param.q_length = attention_seq_len_1; - param.k_length = attention_seq_len_2; - param.num_heads = head_num_; - param.qk_scale = qk_scale; - param.linear_bias_slopes = nullptr; - invokeMaskedSoftmax(param, stream_); - sync_check_cuda_error(); - POP_RANGE; - } - - PUSH_RANGE("QK*V batch gemm"); - - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, + + // + // softmax(Q*K^T) + // + if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { + PUSH_RANGE("Q*K batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, + CUBLAS_OP_N, + attention_seq_len_2, // n + attention_seq_len_1, // m + size_per_head_, // k + 1.0f, + k_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_2 * size_per_head_, // n * k + q_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_1 * size_per_head_, // m * k + 0.0f, + qk_buf_float_, + CUDA_R_32F, + attention_seq_len_2, // n + attention_seq_len_2 * attention_seq_len_1, + request_batch_size * head_num_, // global batch size + CUDA_R_32F); + + sync_check_cuda_error(); + POP_RANGE; + + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = request_batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = nullptr; + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; + } + else { + PUSH_RANGE("Q*K batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, CUBLAS_OP_N, - size_per_head_, - attention_seq_len_1, attention_seq_len_2, - v_buf_2_, + attention_seq_len_1, + size_per_head_, + k_buf_2_, size_per_head_, attention_seq_len_2 * size_per_head_, - qk_buf_, - attention_seq_len_2, - attention_seq_len_1 * attention_seq_len_2, - qkv_buf_2_, + q_buf_2_, size_per_head_, attention_seq_len_1 * size_per_head_, + qk_buf_, + attention_seq_len_2, + attention_seq_len_2 * attention_seq_len_1, request_batch_size * head_num_); - // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) - if (padding_offset == nullptr) { - invokeTransposeQKV(qkv_buf_3_, - qkv_buf_2_, - request_batch_size, - attention_seq_len_1, - head_num_, - size_per_head_, - attention_weights->attention_output_weight.scale, - 0, // int8_mode - stream_); - sync_check_cuda_error(); - } - else { - invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, - qkv_buf_3_, - m, - request_batch_size, - attention_seq_len_1, - head_num_, - size_per_head_, - padding_offset, - attention_weights->attention_output_weight.scale, - 0, // int8_mode - stream_); - } POP_RANGE; + 
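            // Scaled masked softmax over the raw scores just produced: roughly
            // attention_score = softmax(qk_scale * Q*K^T with masked positions suppressed),
            // where qk_scale = 1 / sqrt(size_per_head_) as defined above. In this branch
            // invokeMaskedSoftmax reads and writes qk_buf_ in place before the QK*V
            // batched GEMM below.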
PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = request_batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = nullptr; + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; + } + + PUSH_RANGE("QK*V batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, + CUBLAS_OP_N, + size_per_head_, + attention_seq_len_1, + attention_seq_len_2, + v_buf_2_, + size_per_head_, + attention_seq_len_2 * size_per_head_, + qk_buf_, + attention_seq_len_2, + attention_seq_len_1 * attention_seq_len_2, + qkv_buf_2_, + size_per_head_, + attention_seq_len_1 * size_per_head_, + request_batch_size * head_num_); + + // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) + if (padding_offset == nullptr) { + invokeTransposeQKV(qkv_buf_3_, + qkv_buf_2_, + request_batch_size, + attention_seq_len_1, + head_num_, + size_per_head_, + attention_weights->attention_output_weight.scale, + 0, // int8_mode + stream_); + sync_check_cuda_error(); + } + else { + invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, + qkv_buf_3_, + m, + request_batch_size, + attention_seq_len_1, + head_num_, + size_per_head_, + padding_offset, + attention_weights->attention_output_weight.scale, + 0, // int8_mode + stream_); } - sync_check_cuda_error(); - - PUSH_RANGE("proj gemm"); - - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - hidden_units_, - m, - hidden_units_, - attention_weights->attention_output_weight.kernel, - hidden_units_, - qkv_buf_3_, - hidden_units_, - attention_out, - hidden_units_); POP_RANGE; } + sync_check_cuda_error(); + + // if (layer_id == 0) { + // // shape: [B, L, H] + // T* qkv_buf = (T*)malloc(sizeof(T) * request_batch_size * request_seq_len * hidden_units_); + // cudaMemcpy(qkv_buf, + // qkv_buf_3_, + // sizeof(T) * request_batch_size * request_seq_len * hidden_units_, + // cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < request_batch_size; ++b) { + // std::cout << "["; + // for (int s = 0; s < request_seq_len; ++s) { + // std::cout << "["; + // for (int h = 0; h < 8; ++h) { + // std::cout << qkv_buf[b * request_seq_len * hidden_units_ + // + s * hidden_units_ + // + h] + // << " "; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } + + PUSH_RANGE("proj gemm"); + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + hidden_units_, + m, + hidden_units_, + attention_weights->attention_output_weight.kernel, + hidden_units_, + qkv_buf_3_, + hidden_units_, + attention_out, + hidden_units_); + POP_RANGE; + // if (layer_id == 0) { + // // shape: [B, L, H] + // T* out = (T*)malloc(sizeof(T) * request_batch_size * request_seq_len * hidden_units_); + // cudaMemcpy(out, + // attention_out, + // sizeof(T) * request_batch_size * request_seq_len * hidden_units_, + // cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < request_batch_size; ++b) { + // std::cout << "["; + // for (int s = 0; s < request_seq_len; ++s) { + // std::cout << "["; + // for (int h = 0; h < 8; ++h) { + // std::cout << out[b * request_seq_len * hidden_units_ + // + s * hidden_units_ + // + h] + // << " "; + // } + // 
std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } if (is_free_buffer_after_forward_ == true) { freeBuffer(); @@ -501,8 +515,6 @@ void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, true); k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; v_buf_2_ = k_buf_2_ + batch_size * seq_len * hidden_units_; - decoder_normed_input_ = reinterpret_cast( - allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); // save memory usage when using fmha if (allocate_qk_buf) { @@ -537,7 +549,6 @@ void LLaMAContextAttentionLayer::freeBuffer() allocator_->free((void**)(&qk_buf_)); allocator_->free((void**)(&qkv_buf_2_)); allocator_->free((void**)(&qkv_buf_3_)); - allocator_->free((void**)(&decoder_normed_input_)); if (is_qk_buf_float_ == true) { allocator_->free((void**)(&qk_buf_float_)); diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index e52fdc0a7..e9086e278 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -37,7 +37,6 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { const size_t hidden_units_; const size_t rotary_embedding_dim_; const bool neox_rotary_style_; - static constexpr float layernorm_eps_ = 1e-6f; // fmha runner int sm_ = getSMVersion(); @@ -72,7 +71,6 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { size_t mixed_gemm_ws_bytes_ = 0; char* int8_gemm_workspace_ = nullptr; size_t int8_gemm_ws_bytes_ = 0; - T* decoder_normed_input_ = nullptr; public: LLaMAContextAttentionLayer(size_t max_batch_size, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index c74dd4663..3b4bb56c6 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -83,6 +83,9 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_ context_decoder_output_buf_ = (T*)(allocator_->reMalloc( context_decoder_output_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); + output_logits_ = (T*)(allocator_->reMalloc( + output_logits_, sizeof(T) * batch_size * vocab_size_ * hidden_units_, false)); + is_allocate_buffer_ = true; } @@ -110,6 +113,7 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); + allocator_->free((void**)(&output_logits_)); is_allocate_buffer_ = false; } @@ -379,29 +383,49 @@ void LLaMA::forward(std::unordered_map* output_ten &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - // invokeGeneralLLaMALayerNorm( - // context_decoder_input_buf_, - // embedding_input_buf_, - // llama_weights->post_decoder_layernorm.gamma, - // llama_weights->post_decoder_layernorm.beta, - // layernorm_eps_, - // batch_size * max_input_length, - // hidden_units_, - // stream_); - // sync_check_cuda_error(); - // - // cublas_wrapper_->Gemm(CUBLAS_OP_N, - // CUBLAS_OP_N, - // batch_size * max_input_length, - // vocab_size_, - // hidden_units_, - // context_decoder_output_buf_, - // hidden_units_, // n - // llama_weights->post_decoder_embedding.kernel, - // vocab_size_, // 
k - // /* FIXME */, - // hidden_units_ /* n */); - // sync_check_cuda_error(); + if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { + invokeGeneralLLaMALayerNorm(context_decoder_input_buf_, + context_decoder_output_buf_, + llama_weights->post_decoder_layernorm.gamma, + llama_weights->post_decoder_layernorm.beta, + layernorm_eps_, + batch_size * max_input_length, + hidden_units_, + stream_); + sync_check_cuda_error(); + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + vocab_size_, + batch_size * max_input_length, + hidden_units_, + llama_weights->post_decoder_embedding.kernel, + vocab_size_, + context_decoder_input_buf_, + hidden_units_, // n + output_logits_, + vocab_size_); + sync_check_cuda_error(); + + T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * vocab_size_); + cudaMemcpy(out, + output_logits_, + sizeof(T) * batch_size * max_input_length * vocab_size_, + cudaMemcpyDeviceToHost); + + for (int b = 0; b < batch_size; ++b) { + std::cout << "["; + for (int s = 0; s < max_input_length; ++s) { + std::cout << "["; + for (int v = 0; v < 8; ++v) { + std::cout << out[b * max_input_length * vocab_size_ + s * vocab_size_ + v] << " "; + } + std::cout << "]\n"; + } + std::cout << "]\n"; + } + std::cout << "\n"; + } + setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 386a09cd4..51d3d4dc0 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -82,6 +82,7 @@ class LLaMA: public BaseLayer { T* context_decoder_input_buf_; T* context_decoder_output_buf_; + T* output_logits_; // function pointer callback using callback_sig = void(std::unordered_map*, void*); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index c373c9d09..6a7857539 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -41,8 +41,8 @@ void LLaMAContextDecoder::initialize() false, 0); - ffn_layer_ = new GeluFfnLayer(0, // max_batch_size - 0, + ffn_layer_ = new SiluFfnLayer(0, // max_batch_size + 0, // max_seq_len head_num_, size_per_head_, 0, // expert_num @@ -52,8 +52,7 @@ void LLaMAContextDecoder::initialize() allocator_, is_free_buffer_after_forward_, false, - 0, - false // use_gated_activation = false + true // use_gated_activation = false ); } @@ -66,6 +65,8 @@ void LLaMAContextDecoder::allocateBuffer() template void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) { + decoder_normed_input_ = reinterpret_cast( + allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); self_attn_output_ = reinterpret_cast( allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); ffn_output_ = reinterpret_cast( @@ -83,6 +84,7 @@ template void LLaMAContextDecoder::freeBuffer() { if (is_allocate_buffer_ == true) { + allocator_->free((void**)(&decoder_normed_input_)); allocator_->free((void**)(&self_attn_output_)); allocator_->free((void**)(&ffn_output_)); allocator_->free((void**)(&decoder_layer_output_)); @@ -291,26 +293,43 @@ void LLaMAContextDecoder::forward(std::unordered_map* ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); } + 
invokeGeneralLLaMALayerNorm(decoder_normed_input_, + layer_input, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + h_token_num, + hidden_units_, + stream_); + sync_check_cuda_error(); + // if (l == 0) { + // T* out = (T*)malloc(sizeof(T) * h_token_num * hidden_units_); + // cudaMemcpy(out, decoder_normed_input_, sizeof(T) * h_token_num * hidden_units_, + // cudaMemcpyDeviceToHost); sync_check_cuda_error(); + // + // for (int b = 0; b < h_token_num; ++b) { + // std::cout << "["; + // int i = 0; + // for (int h = 0; h < hidden_units_; ++h) { + // std::cout << out[b * hidden_units_ + h] << " "; + // ++i; + // if (i == 8) + // break; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } + TensorMap self_attention_input_tensors{ - {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}, + {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, {"attention_mask", Tensor{MEMORY_GPU, data_type, {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, attention_mask}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, - {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, - {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, - {"pre_layernorm_weights_gamma", - Tensor{MEMORY_GPU, - data_type, - {(size_t)hidden_units_}, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma}}, - {"pre_layernorm_weights_beta", - Tensor{MEMORY_GPU, - data_type, - {(size_t)hidden_units_}, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta}}}; + {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; if (is_unpadded_mha) { self_attention_input_tensors.insert("padding_offset", @@ -332,68 +351,134 @@ void LLaMAContextDecoder::forward(std::unordered_map* self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); + // if (l == 0) { + // // shape: [B, L, H] + // T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); + // cudaMemcpy( + // out, self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, + // cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < batch_size; ++b) { + // std::cout << "["; + // for (int s = 0; s < seq_len; ++s) { + // std::cout << "["; + // for (int h = 0; h < 8; ++h) { + // std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } + + invokeGeneralLLaMAAddBiasResidualPreLayerNorm( + self_attn_output_, + layer_input, + self_attn_output_, + layer_input, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + h_token_num, + hidden_units_, + stream_); + sync_check_cuda_error(); + + // if (l == 0) { + // // shape: [B, L, H] + // T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); + // cudaMemcpy( + // out, layer_input, sizeof(T) * batch_size * seq_len * hidden_units_, cudaMemcpyDeviceToHost); + // sync_check_cuda_error(); + // + // for (int b = 0; b < batch_size; ++b) { + // std::cout 
<< "["; + // for (int s = 0; s < seq_len; ++s) { + // std::cout << "["; + // for (int h = 0; h < 8; ++h) { + // std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; + // } + // std::cout << "]\n"; + // } + // std::cout << "]\n"; + // } + // std::cout << "\n"; + // } + + TensorMap ffn_input_tensors( + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}}); + TensorMap ffn_output_tensors( + {{"ffn_output", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); + ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); + + invokeAddBiasResidual(layer_output, + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + stream_); + + sync_check_cuda_error(); + +// if (l == 0) { +// // shape: [B, L, H] +// T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); +// cudaMemcpy(out, layer_output, sizeof(T) * batch_size * seq_len * hidden_units_, cudaMemcpyDeviceToHost); +// sync_check_cuda_error(); +// +// for (int b = 0; b < batch_size; ++b) { +// std::cout << "["; +// for (int s = 0; s < seq_len; ++s) { +// std::cout << "["; +// for (int h = 0; h < 8; ++h) { +// std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; +// } +// std::cout << "]\n"; +// } +// std::cout << "]\n"; +// } +// std::cout << "\n"; +// } + + if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 + && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_; + ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); + } - if (is_final == false) { - invokeGeneralAddBiasResidualPreLayerNorm( - self_attn_output_, - layer_input, - self_attn_output_, - layer_input, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, - llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, - layernorm_eps_, - h_token_num, - hidden_units_, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - (float*)nullptr, - 0, - stream_); - - TensorMap ffn_input_tensors( - {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}}); - TensorMap ffn_output_tensors( - {{"ffn_output", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); - ffn_layer_->forward( - &ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); - - invokeAddBiasResidual(layer_output, - self_attn_output_, - llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - h_token_num, - hidden_units_, - stream_); - - sync_check_cuda_error(); - - if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 - && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_; - ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); - } - - if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output, - decoder_layer_output_, - padding_offset_, - h_token_num, - head_num_ * size_per_head_, - stream_); - } + if ((l == num_layer_ - 1) && is_unpadded_mha) { + invokeRebuildPadding(decoder_output, + decoder_layer_output_, + padding_offset_, + h_token_num, + head_num_ * size_per_head_, + stream_); } } - // TODO(bhsueh) We 
could optimize this point by only computing the last token for the last layer - invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), - output_tensors->at("decoder_output").getPtr(), - input_tensors->at("input_lengths").getPtr(), - seq_len, - batch_size, - hidden_units_, - stream_); - sync_check_cuda_error(); +// if (pipeline_para_.rank_ == pipeline_para_.world_size_ -1) { +// // shape: [B, L, H] +// T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); +// cudaMemcpy(out, decoder_output, sizeof(T) * batch_size * seq_len * hidden_units_, cudaMemcpyDeviceToHost); +// sync_check_cuda_error(); +// +// for (int b = 0; b < batch_size; ++b) { +// std::cout << "["; +// for (int s = 0; s < seq_len; ++s) { +// std::cout << "["; +// for (int h = 0; h < 8; ++h) { +// std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; +// } +// std::cout << "]\n"; +// } +// std::cout << "]\n"; +// } +// std::cout << "\n"; +// } + if (is_free_buffer_after_forward_ == true) { freeBuffer(); } diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc index ff2ec11be..6f3a7721f 100644 --- a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc @@ -162,12 +162,12 @@ void LLaMADecoderLayerWeight::setWeightPtr() self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; self_attention_weights.attention_output_weight.bias = weights_ptr[5]; - ffn_weights.intermediate_weight.kernel = weights_ptr[6]; - ffn_weights.intermediate_weight.bias = weights_ptr[7]; - ffn_weights.output_weight.kernel = weights_ptr[8]; - ffn_weights.output_weight.bias = weights_ptr[9]; - ffn_weights.gating_weight.kernel = weights_ptr[10]; - ffn_weights.gating_weight.bias = weights_ptr[11]; + ffn_weights.intermediate_weight.kernel = weights_ptr[6]; + ffn_weights.intermediate_weight.bias = weights_ptr[7]; + ffn_weights.output_weight.kernel = weights_ptr[8]; + ffn_weights.output_weight.bias = weights_ptr[9]; + ffn_weights.intermediate_weight2.kernel = weights_ptr[10]; + ffn_weights.intermediate_weight2.bias = weights_ptr[11]; post_attention_layernorm_weights.beta = weights_ptr[12]; post_attention_layernorm_weights.gamma = weights_ptr[13]; From 4434e65e742474d91acba0b00d73b82822497f66 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 16 Sep 2023 20:27:32 +0000 Subject: [PATCH 15/55] remove debugging code print --- src/fastertransformer/models/llama/LLaMA.cc | 36 ++++++++++----------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 3b4bb56c6..6285b804b 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -406,24 +406,24 @@ void LLaMA::forward(std::unordered_map* output_ten vocab_size_); sync_check_cuda_error(); - T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * vocab_size_); - cudaMemcpy(out, - output_logits_, - sizeof(T) * batch_size * max_input_length * vocab_size_, - cudaMemcpyDeviceToHost); - - for (int b = 0; b < batch_size; ++b) { - std::cout << "["; - for (int s = 0; s < max_input_length; ++s) { - std::cout << "["; - for (int v = 0; v < 8; ++v) { - std::cout << out[b * max_input_length * vocab_size_ + s * vocab_size_ + v] << " "; - } - std::cout << "]\n"; - } - std::cout << "]\n"; - } - std::cout << "\n"; +// T* out = 
(T*)malloc(sizeof(T) * batch_size * max_input_length * vocab_size_); +// cudaMemcpy(out, +// output_logits_, +// sizeof(T) * batch_size * max_input_length * vocab_size_, +// cudaMemcpyDeviceToHost); +// +// for (int b = 0; b < batch_size; ++b) { +// std::cout << "["; +// for (int s = 0; s < max_input_length; ++s) { +// std::cout << "["; +// for (int v = 0; v < 8; ++v) { +// std::cout << out[b * max_input_length * vocab_size_ + s * vocab_size_ + v] << " "; +// } +// std::cout << "]\n"; +// } +// std::cout << "]\n"; +// } +// std::cout << "\n"; } From 95a7efe0b69a872f671b2dd9fd7d4453feae7e5f Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 16 Sep 2023 20:55:50 +0000 Subject: [PATCH 16/55] remove debugging code --- examples/cpp/llama/llama_example.cc | 49 +++--- .../LLaMAContextAttentionLayer.cc | 103 ------------- src/fastertransformer/models/llama/LLaMA.cc | 139 ++---------------- src/fastertransformer/models/llama/LLaMA.h | 5 - .../models/llama/LLaMAContextDecoder.cc | 100 ------------- 5 files changed, 44 insertions(+), 352 deletions(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 43f55c4b7..a558bbf65 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -213,10 +213,8 @@ void llama_example(const INIReader reader) &prop, attention_type); - int* d_output_ids; - int* d_sequence_lengths; - deviceMalloc(&d_output_ids, request_batch_size * total_output_len, false); - deviceMalloc(&d_sequence_lengths, request_batch_size, false); + T* d_output_logits; + deviceMalloc(&d_output_logits, request_batch_size * total_output_len * vocab_size, false); std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", @@ -226,18 +224,14 @@ void llama_example(const INIReader reader) Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}, {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{1}, &min_length}}, {"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}, - {"max_cache_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &max_cache_seq_len}} - }; + {"max_cache_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &max_cache_seq_len}}}; std::unordered_map output_tensors = std::unordered_map{ - {"output_ids", + {"output_logits", Tensor{MEMORY_GPU, - TYPE_INT32, - std::vector{request_batch_size, 1, (size_t)total_output_len}, - d_output_ids}}, - {"sequence_length", - Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size}, d_sequence_lengths}}, - }; + TYPE_FP16, + std::vector{request_batch_size, (size_t)total_output_len, vocab_size}, + d_output_logits}}}; print_mem_usage(); @@ -259,6 +253,25 @@ void llama_example(const INIReader reader) POP_RANGE; ft_nvtx::resetScope(); +// if (rank == world_size-1) { +// T* out = (T*)malloc(sizeof(T) * request_batch_size * total_output_len * vocab_size); +// cudaMemcpy( +// out, d_output_logits, sizeof(T) * request_batch_size * total_output_len * vocab_size, cudaMemcpyDeviceToHost); +// for (int b = 0; b < request_batch_size; ++b) { +// std::cout << "["; +// for (int s = 0; s < total_output_len; ++s) { +// std::cout << "["; +// for (int v = vocab_size-8; v < vocab_size; ++v) { +// std::cout << out[b * total_output_len * vocab_size + s * vocab_size + v] << " "; +// } +// std::cout << "]\n"; +// } +// std::cout << "]\n"; +// } +// std::cout << "\n"; +// } + + /* if (rank == 0) { std::string fName = "out"; @@ -269,7 +282,7 @@ void 
llama_example(const INIReader reader) else { size_t outCount = total_output_len * request_batch_size; int* hBuf = new int[outCount]; - cudaD2Hcpy(hBuf, d_output_ids, outCount); + cudaD2Hcpy(hBuf, d_output_logits, outCount); { std::cout << "Writing " << outCount << " elements\n"; @@ -295,6 +308,7 @@ void llama_example(const INIReader reader) delete[] hBuf; } } + */ // test time struct timeval start, end; @@ -339,11 +353,8 @@ void llama_example(const INIReader reader) if (d_input_lengths != nullptr) { cudaFree(d_input_lengths); } - if (d_output_ids != nullptr) { - deviceFree(d_output_ids); - } - if (d_sequence_lengths != nullptr) { - deviceFree(d_sequence_lengths); + if (d_output_logits != nullptr) { + deviceFree(d_output_logits); } return; diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 70e638150..10e39fd39 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -78,26 +78,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, 3 * hidden_units_ /* n */); sync_check_cuda_error(); - // if (layer_id == 0) { - // T* qkv_buf = (T*)malloc(sizeof(T) * m * 3 * hidden_units_); - // cudaMemcpy(qkv_buf, qkv_buf_, sizeof(T) * m * 3 * hidden_units_, cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < request_batch_size; ++b) { - // std::cout << "["; - // for (int s = 0; s < request_seq_len; ++s) { - // std::cout << "["; - // for (int h = 0; h < 8; ++h) { - // std::cout << qkv_buf[((b * request_seq_len) + s) * 3 * hidden_units_ + h + 2 * hidden_units_] - // << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } - // IDEA: append prefix prompt key value here PrefixPromptBatchWeightsParam param{nullptr, nullptr, 0, (size_t)layer_id * 2 * head_num_ * size_per_head_}; @@ -123,38 +103,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten 0, // int8_mode stream_); sync_check_cuda_error(); - // if (layer_id == 0) { - // // shape: [B, H, L, Dh] - // T* q_buf = (T*)malloc(sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_); - // T* k_buf = q_buf + request_batch_size * request_seq_len * hidden_units_; - // T* v_buf = k_buf + request_batch_size * request_seq_len * hidden_units_; - // cudaMemcpy(q_buf, - // q_buf_2_, - // sizeof(T) * 3 * request_batch_size * request_seq_len * hidden_units_, - // cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < request_batch_size; ++b) { - // std::cout << "["; - // for (int h = 0; h < head_num_; ++h) { - // std::cout << "["; - // for (int s = 0; s < request_seq_len; ++s) { - // std::cout << "["; - // for (int e = 0; e < 8; ++e) { - // std::cout << v_buf[b * head_num_ * request_seq_len * size_per_head_ - // + h * request_seq_len * size_per_head_ - // + s * size_per_head_ - // + e] - // << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length // Use batch major @@ -316,32 +264,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } sync_check_cuda_error(); - // if (layer_id == 0) { - // // shape: [B, L, H] - // T* qkv_buf = (T*)malloc(sizeof(T) * request_batch_size * 
request_seq_len * hidden_units_); - // cudaMemcpy(qkv_buf, - // qkv_buf_3_, - // sizeof(T) * request_batch_size * request_seq_len * hidden_units_, - // cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < request_batch_size; ++b) { - // std::cout << "["; - // for (int s = 0; s < request_seq_len; ++s) { - // std::cout << "["; - // for (int h = 0; h < 8; ++h) { - // std::cout << qkv_buf[b * request_seq_len * hidden_units_ - // + s * hidden_units_ - // + h] - // << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } - PUSH_RANGE("proj gemm"); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, @@ -355,31 +277,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten attention_out, hidden_units_); POP_RANGE; - // if (layer_id == 0) { - // // shape: [B, L, H] - // T* out = (T*)malloc(sizeof(T) * request_batch_size * request_seq_len * hidden_units_); - // cudaMemcpy(out, - // attention_out, - // sizeof(T) * request_batch_size * request_seq_len * hidden_units_, - // cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < request_batch_size; ++b) { - // std::cout << "["; - // for (int s = 0; s < request_seq_len; ++s) { - // std::cout << "["; - // for (int h = 0; h < 8; ++h) { - // std::cout << out[b * request_seq_len * hidden_units_ - // + s * hidden_units_ - // + h] - // << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } if (is_free_buffer_after_forward_ == true) { freeBuffer(); diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 6285b804b..02d46b5b9 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -83,9 +83,6 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_ context_decoder_output_buf_ = (T*)(allocator_->reMalloc( context_decoder_output_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); - output_logits_ = (T*)(allocator_->reMalloc( - output_logits_, sizeof(T) * batch_size * vocab_size_ * hidden_units_, false)); - is_allocate_buffer_ = true; } @@ -113,7 +110,6 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); - allocator_->free((void**)(&output_logits_)); is_allocate_buffer_ = false; } @@ -245,27 +241,15 @@ void LLaMA::forward(std::unordered_map* output_ten // max_cache_seq_len [batch_size] on cpu // output_tensors: - // output_ids [batch_size, 1, max_output_seq_len] - // sequence_length [batch_size] - - // Step is from max_input_length ~ max_output_seq_len, - // When step = k, we put output ids and caches at step k, and the sequence_length would be k - 1 before - // complete this step. - // When there is no input_ids, put the start token at step 0 of output_ids_buf_. 
After forward, only copy - // the step 1 ~ max_output_seq_len of output_ids_buf_ to output_tensors->at(0).data + // output_logits [batch_size, max_output_seq_len, vocab_size] FT_CHECK_WITH_INFO(input_tensors->size() >= 3, "input_tensors->size() >= 3"); - FT_CHECK_WITH_INFO(output_tensors->size() >= 2, "output_tensors->size() >= 2"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); FT_CHECK(input_tensors->find("output_seq_len") != input_tensors->end() && input_tensors->at("output_seq_len").shape.size() == 1); - FT_CHECK(output_tensors->at("output_ids").shape.size() == 3); - FT_CHECK(output_tensors->at("sequence_length").shape.size() == 1); - FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape[0] == output_tensors->at("output_ids").shape[0], - "input_tensors->at(\"input_ids\").shape[0] == output_tensors->at(\"output_ids\").shape[0]"); - const size_t batch_size = output_tensors->at("output_ids").shape[0]; + const size_t batch_size = input_tensors->at("input_ids").shape[0]; // NOTE: Prefix Prompt PreProcessing // get prefix_prompt_weight for each batch --> shape [batch, 1] @@ -332,27 +316,6 @@ void LLaMA::forward(std::unordered_map* output_ten hidden_units_, stream_); sync_check_cuda_error(); - // if (pipeline_para_.rank_ == 0) { - // T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * hidden_units_); - // cudaMemcpy(out, - // context_decoder_input_buf_, - // sizeof(T) * batch_size * max_input_length * hidden_units_, - // cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < batch_size; ++b) { - // std::cout << "["; - // for (int s = 0; s < max_input_length; ++s) { - // std::cout << "["; - // for (int h = 0; h < 8; ++h) { - // std::cout << out[b * batch_size * hidden_units_ + s * hidden_units_ + h] << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } invokeBuildDecoderAttentionMask( input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, max_input_length, 0, stream_); @@ -384,6 +347,7 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { + T* output_logits = output_tensors->at("output_logits").getPtr(); invokeGeneralLLaMALayerNorm(context_decoder_input_buf_, context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, @@ -394,41 +358,20 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - vocab_size_, - batch_size * max_input_length, - hidden_units_, - llama_weights->post_decoder_embedding.kernel, - vocab_size_, - context_decoder_input_buf_, - hidden_units_, // n - output_logits_, - vocab_size_); + CUBLAS_OP_N, + vocab_size_, + batch_size * max_input_length, + hidden_units_, + llama_weights->post_decoder_embedding.kernel, + vocab_size_, + context_decoder_input_buf_, + hidden_units_, // n + output_logits, + vocab_size_); sync_check_cuda_error(); - -// T* out = (T*)malloc(sizeof(T) * batch_size * max_input_length * vocab_size_); -// cudaMemcpy(out, -// output_logits_, -// sizeof(T) * batch_size * max_input_length * vocab_size_, -// cudaMemcpyDeviceToHost); -// -// for (int b = 0; b < batch_size; ++b) { -// std::cout << "["; -// for (int s = 0; s < max_input_length; ++s) { -// std::cout << "["; -// for (int v = 0; v < 8; ++v) { -// std::cout << out[b * max_input_length * vocab_size_ + s * vocab_size_ + v] << " "; -// } -// 
std::cout << "]\n"; -// } -// std::cout << "]\n"; -// } -// std::cout << "\n"; } - - setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); - sendTensorsToFirstPipelineNode(output_tensors, input_tensors); + // sendTensorsToFirstPipelineNode(output_tensors, input_tensors); } template @@ -467,60 +410,6 @@ void LLaMA::sendTensorsToFirstPipelineNode(std::unordered_map -void LLaMA::setOutputTensors(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const size_t max_input_length, - const size_t max_output_seq_len) -{ - FT_LOG_DEBUG(__PRETTY_FUNCTION__); - if (pipeline_para_.rank_ != pipeline_para_.world_size_ - 1) { - return; - } - - const size_t batch_size = output_tensors->at("output_ids").shape[0]; - uint* sequence_lengths = output_tensors->at("sequence_length").getPtr(); - - if (input_tensors->at("input_ids").shape[1] == 0) { - invokeCudaD2DcpyConvert( - sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); - // TODO: D2D sequence_lenghts - // For sampling, only copy the results to output_tensor - invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), - output_ids_buf_ + batch_size, - max_output_seq_len - 1, - batch_size, - 1, - stream_); - } - else { - - // For sampling, it is equivalent to all parent ids are 0. - gatherTreeParam param; - param.beams = transposed_output_ids_buf_; - param.max_sequence_lengths = sequence_lengths_; - // add sequence_length 1 here because the sequence_length of time step t is t - 1 - param.max_sequence_length_final_step = 1; - param.max_time = max_output_seq_len; - param.batch_size = batch_size; - param.beam_width = 1; - param.step_ids = output_ids_buf_; - param.parent_ids = nullptr; - param.end_tokens = end_ids_buf_; - param.max_input_length = max_input_length; - param.prefix_soft_prompt_lengths = nullptr; - param.input_lengths = tiled_input_lengths_buf_; - param.max_prefix_soft_prompt_length = 0; - param.max_input_without_prompt_length = max_input_length; - param.stream = stream_; - param.output_ids = output_tensors->at("output_ids").getPtr(); - invokeGatherTree(param); - invokeCudaD2DcpyConvert( - sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); - sync_check_cuda_error(); - } -} - template size_t LLaMA::getPipelineParallelRank() { diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 51d3d4dc0..3b7995927 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -82,17 +82,12 @@ class LLaMA: public BaseLayer { T* context_decoder_input_buf_; T* context_decoder_output_buf_; - T* output_logits_; // function pointer callback using callback_sig = void(std::unordered_map*, void*); callback_sig* token_generated_cb_ = nullptr; void* token_generated_ctx_ = nullptr; - void setOutputTensors(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors, - const size_t max_input_length, - const size_t max_seq_len); void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 6a7857539..49af917de 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -302,24 +302,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* hidden_units_, 
stream_); sync_check_cuda_error(); - // if (l == 0) { - // T* out = (T*)malloc(sizeof(T) * h_token_num * hidden_units_); - // cudaMemcpy(out, decoder_normed_input_, sizeof(T) * h_token_num * hidden_units_, - // cudaMemcpyDeviceToHost); sync_check_cuda_error(); - // - // for (int b = 0; b < h_token_num; ++b) { - // std::cout << "["; - // int i = 0; - // for (int h = 0; h < hidden_units_; ++h) { - // std::cout << out[b * hidden_units_ + h] << " "; - // ++i; - // if (i == 8) - // break; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, @@ -351,27 +333,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); - // if (l == 0) { - // // shape: [B, L, H] - // T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); - // cudaMemcpy( - // out, self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, - // cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < batch_size; ++b) { - // std::cout << "["; - // for (int s = 0; s < seq_len; ++s) { - // std::cout << "["; - // for (int h = 0; h < 8; ++h) { - // std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, @@ -387,27 +348,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* stream_); sync_check_cuda_error(); - // if (l == 0) { - // // shape: [B, L, H] - // T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); - // cudaMemcpy( - // out, layer_input, sizeof(T) * batch_size * seq_len * hidden_units_, cudaMemcpyDeviceToHost); - // sync_check_cuda_error(); - // - // for (int b = 0; b < batch_size; ++b) { - // std::cout << "["; - // for (int s = 0; s < seq_len; ++s) { - // std::cout << "["; - // for (int h = 0; h < 8; ++h) { - // std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; - // } - // std::cout << "]\n"; - // } - // std::cout << "]\n"; - // } - // std::cout << "\n"; - // } - TensorMap ffn_input_tensors( {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}}); TensorMap ffn_output_tensors( @@ -423,26 +363,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); -// if (l == 0) { -// // shape: [B, L, H] -// T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); -// cudaMemcpy(out, layer_output, sizeof(T) * batch_size * seq_len * hidden_units_, cudaMemcpyDeviceToHost); -// sync_check_cuda_error(); -// -// for (int b = 0; b < batch_size; ++b) { -// std::cout << "["; -// for (int s = 0; s < seq_len; ++s) { -// std::cout << "["; -// for (int h = 0; h < 8; ++h) { -// std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; -// } -// std::cout << "]\n"; -// } -// std::cout << "]\n"; -// } -// std::cout << "\n"; -// } - if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; @@ -459,26 +379,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* } } -// if (pipeline_para_.rank_ == pipeline_para_.world_size_ -1) { -// // shape: 
[B, L, H] -// T* out = (T*)malloc(sizeof(T) * batch_size * seq_len * hidden_units_); -// cudaMemcpy(out, decoder_output, sizeof(T) * batch_size * seq_len * hidden_units_, cudaMemcpyDeviceToHost); -// sync_check_cuda_error(); -// -// for (int b = 0; b < batch_size; ++b) { -// std::cout << "["; -// for (int s = 0; s < seq_len; ++s) { -// std::cout << "["; -// for (int h = 0; h < 8; ++h) { -// std::cout << out[b * seq_len * hidden_units_ + s * hidden_units_ + h] << " "; -// } -// std::cout << "]\n"; -// } -// std::cout << "]\n"; -// } -// std::cout << "\n"; -// } - if (is_free_buffer_after_forward_ == true) { freeBuffer(); } From 0a0015d61fcf9fed413a491f870b7e2b1e88eba5 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 18 Sep 2023 04:05:24 +0000 Subject: [PATCH 17/55] LLaMA Constructor fix --- src/fastertransformer/th_op/CMakeLists.txt | 3 + src/fastertransformer/th_op/llama/LLaMA.h | 97 +++++++++------------- 2 files changed, 44 insertions(+), 56 deletions(-) diff --git a/src/fastertransformer/th_op/CMakeLists.txt b/src/fastertransformer/th_op/CMakeLists.txt index b9f2b9151..4e8d82d30 100644 --- a/src/fastertransformer/th_op/CMakeLists.txt +++ b/src/fastertransformer/th_op/CMakeLists.txt @@ -32,6 +32,7 @@ add_subdirectory(t5) add_subdirectory(bart) add_subdirectory(bert) add_subdirectory(deberta) +add_subdirectory(llama) add_library(th_transformer SHARED $ @@ -49,6 +50,7 @@ add_library(th_transformer SHARED $ $ $ + $ ) target_link_libraries(th_transformer PUBLIC "${TORCH_LIBRARIES}" th_bart @@ -66,6 +68,7 @@ target_link_libraries(th_transformer PUBLIC "${TORCH_LIBRARIES}" th_t5 th_utils th_vit + th_llama ) if(ENABLE_FP8) diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 1aac8a7d7..9a5efa3d0 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -50,18 +50,18 @@ template class FTLLaMA: public IFLLaMA { public: FTLLaMA(const size_t head_num, - const size_t size_per_head, - const size_t inter_size, - const size_t layer_num, - const size_t vocab_size, - const size_t rotary_embedding_dim, - const int start_id, - const int end_id, - const int64_t tensor_para_size, - const int64_t pipeline_para_size, - const size_t max_seq_len, - const bool use_gptj_residual, - const vector weights): + const size_t size_per_head, + const size_t inter_size, + const size_t layer_num, + const size_t vocab_size, + const size_t rotary_embedding_dim, + const int start_id, + const int end_id, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const size_t max_seq_len, + const bool use_gptj_residual, + const vector weights): head_num_(head_num), size_per_head_(size_per_head), inter_size_(inter_size), @@ -114,7 +114,7 @@ class FTLLaMA: public IFLLaMA { llama_weights_.post_decoder_layernorm.beta = get_ptr(weights_[12 * layer_num_ + 2]); llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[12 * layer_num_ + 3]); - llama_weights_.setMaxSeqLen(max_seq_len); + //llama_weights_.setMaxSeqLen(max_seq_len); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); } @@ -172,35 +172,20 @@ class FTLLaMA: public IFLLaMA { false, // with_relative_position_bias true); // causal_mask - ft::LLaMA llama = ft::LLaMA(head_num_, - size_per_head_, - inter_size_, - layer_num_, - vocab_size_, - rotary_embedding_dim_, - start_id_, - end_id_, - end_id_ + 1, // p/prompt tuning virtual token start id - ft::PromptLearningType::no_prompt, - use_gptj_residual_, - 0.0f, // beam_search_diversity_rate, - 1, // top_k, - 
0.0, // top_p, - 0, // random_seed, - 1.0f, // temperature, - 1.0f, // len_penalty, - 1.0f, // repetition_penalty, - tensor_para_, - pipeline_para_, - stream, - &cublas_wrapper, - &allocator, - false, // is_free_buffer_after_forward - &prop_, // cuda_device_prop - attention_type, // attention_type - nullptr, // custom_all_reduce_comm - 0); // enable_custom_all_reduce - + ft::LLaMA llama = ft::LLaMA(head_num_, + size_per_head_, + inter_size_, + layer_num_, + vocab_size_, + rotary_embedding_dim_, + 0, // random_seed, + stream, + &cublas_wrapper, + &allocator, + false, // is_free_buffer_after_forward + &prop_, // cuda_device_prop + attention_type // attention_type + ); std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ @@ -297,7 +282,7 @@ class FTLLaMA: public IFLLaMA { std::mutex* cublas_wrapper_mutex_; ft::cublasAlgoMap* cublas_algo_map_; struct cudaDeviceProp prop_; - ft::LLaMAWeight llama_weights_; + ft::LLaMAWeight llama_weights_; ft::NcclParam tensor_para_; ft::NcclParam pipeline_para_; @@ -309,18 +294,18 @@ class FTLLaMA: public IFLLaMA { class LLaMA: public th::jit::CustomClassHolder { public: LLaMA(const int64_t head_num, - const int64_t size_per_head, - const int64_t inter_size, - const int64_t layer_num, - const int64_t vocab_size, - const int64_t rotary_embedding_dim, - const int64_t start_id, - const int64_t end_id, - const int64_t tensor_para_size, - const int64_t pipeline_para_size, - const int64_t max_seq_len, - const bool use_gptj_residual, - const vector weights); + const int64_t size_per_head, + const int64_t inter_size, + const int64_t layer_num, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t start_id, + const int64_t end_id, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const int64_t max_seq_len, + const bool use_gptj_residual, + const vector weights); ~LLaMA(); @@ -339,7 +324,7 @@ class LLaMA: public th::jit::CustomClassHolder { private: const at::ScalarType st_; - IFLLaMA* ftllama; + IFLLaMA* ftllama; std::vector weights; }; From 6ed374791f932f61ee2072d1d6cc623f2b066810 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 18 Sep 2023 09:11:32 +0000 Subject: [PATCH 18/55] llama-opt --- examples/cpp/llama/backup.csv | 32 ++++++ examples/cpp/llama/llama_config.ini | 3 +- examples/cpp/llama/llama_example.cc | 107 ++++++++---------- examples/cpp/llama/start_ids.csv | 36 +++++- .../LLaMAContextAttentionLayer.cc | 62 +++------- .../LLaMAContextAttentionLayer.h | 22 +--- src/fastertransformer/models/llama/LLaMA.cc | 69 +++-------- src/fastertransformer/models/llama/LLaMA.h | 95 ++++++---------- .../models/llama/LLaMAContextDecoder.cc | 21 +--- .../models/llama/LLaMAContextDecoder.h | 9 +- 10 files changed, 187 insertions(+), 269 deletions(-) create mode 100644 examples/cpp/llama/backup.csv diff --git a/examples/cpp/llama/backup.csv b/examples/cpp/llama/backup.csv new file mode 100644 index 000000000..eb28ed345 --- /dev/null +++ b/examples/cpp/llama/backup.csv @@ -0,0 +1,32 @@ +1, 14542, 3262, 8112, 29901, 7803, 1757, 526, 13407, 297, 263, 13569, 29889, 2688 +1, 7392, 1026, 29901, 319, 11379, 15028, 297, 263, 17948, 8693, 29889, 450, 11379 +1, 5057, 12500, 29901, 319, 8023, 338, 2734, 1623, 263, 5702, 29889, 450, 8023 +1, 17984, 18558, 29901, 1334, 1074, 263, 767, 13407, 297, 263, 5716, 29889, 450, 767 +1, 19509, 263, 1766, 25206, 29901, 11647, 526, 2734, 373, 278, 11952, 29889, 319, 767 +1, 7412, 292, 11565, 29901, 11647, 526, 16246, 5742, 6131, 
13587, 3949, 18464, 29889, 11647 +1, 7412, 292, 10311, 26082, 29901, 319, 767, 338, 16246, 2768, 263, 5716, 29889, 940 +1, 323, 4524, 29901, 319, 767, 322, 6114, 526, 16246, 373, 263, 7408, 4208, 29889, 2688 +1, 8565, 375, 3183, 29901, 319, 767, 338, 13407, 701, 322, 8026, 263, 7679, 29889, 11647 +1, 12878, 292, 278, 11203, 29901, 319, 6114, 17042, 2039, 714, 11480, 278, 17455, 29889, 7803, 2319, 26361 +1, 5057, 12500, 29901, 319, 6114, 338, 22049, 3412, 263, 5702, 29889, 2296 +1, 8565, 11203, 29901, 319, 2919, 19174, 338, 22229, 2820, 263, 1746, 29889, 11647 +1, 8360, 5367, 29901, 319, 6114, 338, 409, 630, 472, 263, 1591, 29889, 2296 +1, 476, 484, 14067, 29901, 319, 767, 17905, 1379, 373, 263, 17132, 29889, 450, 767 +1, 7412, 292, 378, 25496, 29901, 319, 767, 338, 16246, 5742, 1023, 28987, 29889, 940 +1, 1706, 262, 1076, 29901, 319, 767, 338, 16246, 373, 385, 15058, 4768, 446, 29889, 940 +1, 390, 5086, 11308, 29901, 319, 767, 338, 1153, 9292, 11308, 297, 263, 29413, 29889, 940 +1, 7412, 292, 11210, 336, 29901, 319, 767, 715, 16926, 263, 21387, 964, 670, 11210, 29889, 940 +1, 7412, 292, 11210, 336, 29901, 319, 4123, 767, 269, 1169, 373, 263, 6592, 29889, 450, 767 +1, 28551, 292, 29901, 11647, 526, 13407, 373, 263, 17306, 310, 15007, 29889, 11647 +1, 8481, 24613, 1847, 29901, 319, 767, 338, 13407, 373, 263, 19587, 11952, 29889, 940 +1, 7412, 292, 1248, 29877, 29901, 11647, 526, 2381, 25217, 297, 278, 4094, 29889, 7803, 5866 +1, 5057, 12500, 29901, 319, 767, 15028, 297, 278, 7256, 310, 263, 10728, 1974, 29889, 29445, 6289 +1, 5057, 12500, 29901, 319, 767, 338, 4318, 2734, 1623, 263, 5702, 29889, 940 +1, 6781, 8522, 29901, 11647, 526, 13407, 373, 263, 1746, 9963, 29889, 450, 1757 +1, 8565, 11203, 29901, 319, 6114, 338, 8743, 411, 263, 11203, 29889, 450, 11203 +1, 3925, 25217, 29901, 319, 2381, 25217, 11565, 338, 4318, 297, 263, 5716, 29889, 7567 +1, 6163, 23131, 292, 29901, 12753, 2305, 748, 23131, 292, 1623, 263, 10952, 29889, 2688 +1, 399, 336, 3262, 22981, 29901, 1334, 1074, 263, 3800, 373, 18187, 29889, 319, 2022 +1, 28551, 292, 29901, 319, 2022, 338, 14993, 292, 1623, 263, 17306, 310, 15007, 29889, 2688 +1, 399, 1161, 292, 3700, 29901, 319, 6114, 338, 13407, 297, 263, 5716, 9963, 29889, 2296 +1, 2522, 11495, 1933, 292, 29901, 319, 767, 338, 1090, 4094, 297, 263, 11565, 29889, 940 diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 1e92695e5..931b24e5d 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -6,8 +6,7 @@ pipeline_para_size=4 [request] -beam_width=1 # beam width for beam search -request_batch_size=4 # determine by the request +request_batch_size=32 [llama_33B] head_num=52 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index a558bbf65..d2c8dcf51 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -189,13 +189,30 @@ void llama_example(const INIReader reader) mpi::bcast(&random_seed, 1, mpi::MPI_TYPE_UNSIGNED_LONG_LONG, 0, mpi::COMM_WORLD); } - AttentionType attention_type = getAttentionType(size_per_head, - getSMVersion(), - true, // remove_padding - 0, // llama supports any-seq-length fmha - true, // is_fuse - false, // with_relative_position_bias - true); // causal_mask + AttentionType attention_type = + getAttentionType(size_per_head, + getSMVersion(), + !((std::getenv("SHONG_PADDING") != nullptr) + && (std::string(std::getenv("SHONG_PADDING")) == "ON")), //true, // remove_padding + 0, // llama supports 
any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + + switch (attention_type) { + case AttentionType::UNFUSED_MHA: + std::cout << "UNFUSED_MHA\n"; + break; + case AttentionType::UNFUSED_PADDED_MHA: + std::cout << "UNFUSED_PADDED_MHA\n"; + break; + case AttentionType::FUSED_MHA: + std::cout << "FUSED_MHA\n"; + break; + case AttentionType::FUSED_PADDED_MHA: + std::cout << "FUSED_PADDED_MHA\n"; + break; + } LLaMA llama = LLaMA(head_num, size_per_head, @@ -239,7 +256,6 @@ void llama_example(const INIReader reader) cudaDeviceSynchronize(); mpi::barrier(); - cudaProfilerStart(); // warm up ite = 1; ft_nvtx::setScope("warmup_time"); @@ -253,71 +269,39 @@ void llama_example(const INIReader reader) POP_RANGE; ft_nvtx::resetScope(); -// if (rank == world_size-1) { -// T* out = (T*)malloc(sizeof(T) * request_batch_size * total_output_len * vocab_size); -// cudaMemcpy( -// out, d_output_logits, sizeof(T) * request_batch_size * total_output_len * vocab_size, cudaMemcpyDeviceToHost); -// for (int b = 0; b < request_batch_size; ++b) { -// std::cout << "["; -// for (int s = 0; s < total_output_len; ++s) { -// std::cout << "["; -// for (int v = vocab_size-8; v < vocab_size; ++v) { -// std::cout << out[b * total_output_len * vocab_size + s * vocab_size + v] << " "; -// } -// std::cout << "]\n"; -// } -// std::cout << "]\n"; -// } -// std::cout << "\n"; -// } - - /* - if (rank == 0) { - - std::string fName = "out"; - auto outFile = std::ofstream(fName, std::ios::out); - if (!outFile.is_open()) { - printf("[WARNING] Cannot write results into output file %s \n", fName.c_str()); - } - else { - size_t outCount = total_output_len * request_batch_size; - int* hBuf = new int[outCount]; - cudaD2Hcpy(hBuf, d_output_logits, outCount); - - { - std::cout << "Writing " << outCount << " elements\n"; - int zeroCount = 0; - for (size_t i = 0; i < outCount; i++) { - if (hBuf[i] == int(0)) { - zeroCount++; - } - outFile << hBuf[i] << " "; - if ((i + 1) % (total_output_len) == 0) { - outFile << std::endl; - } - - if (i < 10) { - printf("%5d ", hBuf[i]); - } - if ((i + 1) % (total_output_len) == 0 && i < 10) { - std::cout << std::endl; - } + if (rank == world_size - 1) { + T* out = (T*)malloc(sizeof(T) * request_batch_size * total_output_len * vocab_size); + cudaMemcpy(out, + d_output_logits, + sizeof(T) * request_batch_size * total_output_len * vocab_size, + cudaMemcpyDeviceToHost); + for (int b = 0; b < request_batch_size; ++b) { + std::cout << "["; + for (int s = 0; s < total_output_len; ++s) { + std::cout << "["; + for (int v = vocab_size - 8; v < vocab_size; ++v) { + std::cout << out[b * total_output_len * vocab_size + s * vocab_size + v] << " "; } - std::cout << std::endl << "zeroCount = " << zeroCount << std::endl; + std::cout << "]\n"; } - delete[] hBuf; + std::cout << "]\n"; } + std::cout << "\n"; + free(out); } - */ // test time + cudaProfilerStart(); struct timeval start, end; - mpi::barrier(); cudaDeviceSynchronize(); + mpi::barrier(); + gettimeofday(&start, NULL); ft_nvtx::setScope("total_time"); PUSH_RANGE("total time") + // warm up + ite = 3; for (int i = 0; i < ite; ++i) { llama.forward(&output_tensors, &input_tensors, &llama_weights); } @@ -328,7 +312,6 @@ void llama_example(const INIReader reader) POP_RANGE; ft_nvtx::resetScope(); gettimeofday(&end, NULL); - cudaProfilerStop(); printf("[INFO] request_batch_size %ld head_num %ld size_per_head %ld total_output_len %d" diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 
a74083153..58bc4b4f6 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -1,4 +1,32 @@ -1, 14542, 3262, 8112, 29901, 7803, 1757, 526, 13407, 297, 263, 13569, 29889, 2688, 526, 13587, 701, 27815, 29889, 0 -1, 5057, 12500, 29901, 319, 8023, 338, 2734, 1623, 263, 5702, 29889, 450, 8023, 6057, 964, 263, 1559, 29889, 0 -1, 5057, 12500, 29901, 319, 8023, 338, 2734, 1623, 263, 5702, 29889, 450, 8023, 4947, 297, 263, 1775, 29889, 0 -1, 28551, 292, 29901, 11647, 526, 13407, 373, 263, 17306, 310, 15007, 29889, 11647, 526, 1985, 2768, 263, 5214, 29889 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 432, 809, 295, 719, 27372, 29889, 7605, 263, 286, 789, 5941, 292, 10823, 363, 596, 3632, 331, 1943, 432, 809, 295, 719, 27372, 9522, 412, 29889, 360, 728, 29559, 411, 1395, 559, 29899, 7582, 1259, 4426, 508, 367, 1304, 29892, 541, 14383, 270, 728, 29559, 411, 4023, 845, 9418, 29899, 29890, 5761, 616, 4426, 408, 445, 508, 17820, 278, 8341, 1283, 432, 809, 295, 719, 28001 , 29889 +1, 3201, 955, 29901, 1128, 304, 679, 304, 413, 4442, 340, 29889, 315, 905, 263, 1513, 16286, 304, 413, 4442, 2165, 4799, 637, 313, 29926, 6547, 3300, 352, 13607, 6121, 4799, 637, 29897, 515, 29129, 1450, 470, 4655, 14721, 273, 14368, 29892, 1316, 408, 286, 348, 436, 29892, 301, 898, 265, 29892, 1226, 23559, 29892, 10395, 2429, 29892, 282, 1431, 434, 29892, 610, 275, 470, 7655, 1915, 29889, 1704, 26536, 3160, 3287, 1248, 728, 4799, 9012 +1, 8778, 322, 19906, 29901, 1128, 304, 12566, 330, 2390, 267, 515, 17564, 29879, 29889, 26428, 596, 2646, 412, 325, 1475, 411, 2691, 27716, 7787, 1259, 304, 12566, 278, 330, 2390, 267, 29889, 450, 27716, 881, 367, 1546, 29871, 29900, 29889, 29945, 304, 29871, 29900, 29889, 29947, 3533, 17528, 690, 313, 29900, 29889, 29900, 29906, 29900, 304, 29871, 29900, 29889, 29900, 29941, 29896, 297, 29897, 304, 12566, 278, 330, 2390, 267, 515, 285, 3687, 29892, 25550 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 6548, 5520, 321, 29891, 295, 1161, 267, 322, 2989, 261, 321, 29891, 774, 798, 18180, 29889, 15154, 1207, 786, 4646, 368, 29889, 341, 6151, 2518, 322, 321, 29891, 774, 798, 9127, 29892, 2175, 373, 321, 29891, 295, 1161, 267, 322, 321, 29891, 774, 5727, 975, 11147, 674, 18658, 1438, 15409, 2578, 25414, 322, 674, 5557, 321, 29891, 295, 1161, 267, 322, 321, 29891, 774, 5727, 515, 15678, 636 +1, 26040, 29901, 1128, 304, 1207, 3632, 331, 1943, 6635, 1634, 514, 296, 29889, 3462, 278, 18853, 17182, 304, 263, 805, 764, 18046, 280, 29889, 1152, 278, 1634, 514, 296, 29892, 366, 29915, 645, 817, 263, 29871, 29906, 29899, 21543, 313, 29945, 29929, 286, 29880, 29897, 12917, 805, 764, 18046, 280, 29889, 317, 802, 29872, 911, 29871, 29906, 4441, 567, 310, 454, 3712, 18853, 17182, 1919, 29871, 29906, 4441, 567, 310, 8775, 24841, 18853, 17182, 29892 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 7375, 322, 269, 473, 22300, 29889, 323, 2209, 278, 282, 548, 658, 262, 411, 278, 6501, 577, 29891, 12507, 346, 29889, 3462, 29871, 29945, 29871, 1309, 778, 313, 29896, 29946, 29906, 330, 29897, 310, 10814, 6393, 282, 548, 658, 262, 393, 29915, 29879, 1063, 5700, 297, 29871, 30515, 29899, 22466, 313, 29953, 29899, 4317, 29897, 12003, 10076, 567, 322, 29871, 29906, 734, 294, 1129, 787, 313, 29896 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 19417, 325, 524, 482, 3438, 2017, 432, 809, 295, 719, 373, 18230, 388, 29889, 29301, 1432, 325, 524, 482, 3438, 2017, 432, 809, 295, 719, 10754, 2909, 322, 1432, 325, 524, 482, 3438, 2017, 432, 809, 295, 719, 1856, 3268, 366, 508, 1284, 
29889, 6280, 4447, 675, 7535, 411, 3785, 11949, 29892, 278, 664, 310, 1532, 2998, 2874, 414, 29892, 12713, 22848, 29892, 1539, 1338, 29892, 25702 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 19531, 260, 4227, 29880, 1600, 384, 7901, 10412, 29889, 14542, 852, 263, 19875, 29891, 470, 1045, 3594, 260, 4227, 29880, 1600, 384, 363, 263, 901, 3209, 950, 1106, 29889, 319, 12003, 29892, 1302, 1537, 260, 4227, 29880, 1600, 384, 7901, 1008, 322, 263, 5101, 310, 1320, 1253, 287, 1444, 550, 338, 278, 4922, 982, 304, 7952, 14294, 373, 263, 11220, 4723, 355, 29889, 7357, 523, 2814, 470, 260 +1, 25453, 322, 17465, 292, 29901, 1128, 304, 17545, 901, 330, 2390, 267, 29889, 3462, 330, 2390, 267, 304, 596, 4497, 328, 29889, 319, 5972, 322, 4780, 982, 304, 7910, 278, 5253, 310, 330, 2390, 267, 297, 596, 14218, 652, 300, 338, 304, 28189, 263, 2846, 8870, 1490, 330, 2390, 267, 373, 2246, 310, 263, 301, 3322, 29899, 272, 270, 2559, 814, 603, 4497, 328, 29889, 450, 14225, 21054, 272, 322, 7990, 18459, 310, 278, 330 +1, 15202, 29901, 1128, 304, 11039, 403, 18655, 1849, 964, 263, 9045, 29891, 26044, 29889, 8561, 263, 18655, 519, 885, 2572, 569, 363, 26044, 29889, 319, 18655, 519, 885, 2572, 569, 338, 263, 2560, 270, 728, 1754, 491, 872, 329, 29948, 292, 18655, 1849, 297, 263, 4091, 340, 7243, 322, 769, 4417, 367, 2579, 29808, 975, 963, 304, 4808, 278, 270, 728, 4208, 29889, 14893, 491, 4417, 738, 18655, 1849, 366, 763, 29892, 3704, 373, 1080 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 263, 3632, 331, 1943, 9045, 29891, 3700, 471, 29882, 29889, 422, 26062, 599, 310, 278, 2348, 1127, 10070, 29889, 512, 263, 18350, 29899, 29879, 1891, 12580, 29880, 29892, 6837, 4208, 29871, 30226, 18002, 313, 29946, 29945, 330, 29897, 310, 29081, 288, 1446, 29892, 29871, 30515, 18002, 313, 29945, 29929, 286, 29880, 29897, 10849, 454, 3712, 3623, 625, 29892, 29871, 30515, 18002, 313, 29945, 29929, 286, 29880, 29897 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 1207, 18655, 13956, 286, 1878, 8345, 8310, 29891, 29889, 5701, 1082, 278, 373, 291, 322, 286, 1878, 18901, 29889, 940, 271, 29871, 30226, 18002, 313, 29896, 29906, 29900, 286, 29880, 29897, 310, 4805, 29899, 2405, 5359, 288, 9258, 17182, 297, 263, 2919, 12507, 346, 8357, 975, 18350, 29899, 9812, 12871, 29889, 9038, 278, 17182, 528, 6727, 414, 29892, 788, 29871, 30226, 310, 263, 2319, 373, 291, 393 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 6958, 10992, 2963, 29889, 15484, 263, 1246, 363, 278, 19075, 982, 304, 6159, 10992, 2963, 29889, 960, 366, 723, 763, 304, 505, 263, 2022, 29899, 517, 29899, 10532, 14983, 29892, 270, 616, 278, 2498, 297, 6578, 2722, 1196, 472, 29871, 29896, 29899, 29947, 29900, 29900, 29899, 29953, 29953, 29947, 29899, 29953, 29955, 29953, 29945, 29889, 2688, 29915, 276, 1722, 7398, 388, 304, 1424, 22394, 29871, 29955, 263 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 8267, 596, 321, 29891, 774, 5727, 411, 263, 8006, 272, 29889, 349, 27574, 263, 8006, 272, 1754, 10816, 363, 321, 29891, 774, 5727, 29889, 319, 3918, 8006, 272, 674, 451, 2367, 366, 13173, 3347, 29879, 29892, 322, 508, 367, 18215, 304, 671, 2978, 596, 5076, 29889, 8669, 20590, 385, 321, 29891, 774, 798, 8006, 272, 29892, 5069, 2319, 12995, 311, 674, 2367, 366, 278, 3347, 29879, 366, 13521 +1, 15202, 29901, 1128, 304, 5040, 23023, 1848, 321, 1218, 29889, 360, 8349, 7268, 403, 1546, 9128, 18757, 261, 322, 23023, 1848, 18757, 261, 29889, 1763, 18720, 278, 9946, 310, 596, 23023, 1848, 321, 1218, 29892, 372, 1122, 367, 5407, 304, 937, 2274, 746, 366, 526, 11223, 4824, 1711, 
9074, 14793, 322, 746, 366, 526, 11223, 953, 8194, 635, 9074, 14793, 29889, 26991, 29892, 23023, 1848, 18757, 261, 5304, 373, 11584, 322, 23880, 5065, 5362, 29889 +1, 4231, 749, 322, 15197, 29901, 1128, 304, 289, 5790, 1044, 4856, 29889, 7519, 29883, 403, 7535, 1048, 278, 289, 5790, 1044, 29889, 450, 289, 5790, 1044, 29892, 884, 2998, 408, 278, 23729, 1458, 19119, 9045, 1044, 313, 1579, 272, 1458, 1002, 1082, 16385, 29871, 29941, 29929, 29946, 29892, 760, 474, 29897, 338, 263, 4307, 393, 471, 4502, 304, 9801, 322, 1072, 5987, 11176, 14703, 19119, 9045, 14502, 5786, 363, 1906, 23164, 515, 263, 19119, 4486 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 263, 5613, 15774, 23895, 2017, 29889, 4007, 6967, 278, 2348, 1127, 10070, 29889, 1152, 445, 9522, 412, 29892, 366, 674, 817, 278, 1494, 4452, 584, 29871, 29896, 2894, 293, 274, 2559, 314, 265, 12070, 1919, 29871, 29906, 29945, 2894, 293, 5881, 314, 290, 2532, 29879, 1919, 29871, 29896, 29945, 2894, 293, 17184, 1960, 1919, 29871, 29896, 2894, 293, 1109, 2911, 17796, 1919, 29871, 29896, 10849, 2894, 293 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 471, 29882, 4105, 4841, 29889, 14542, 852, 278, 1492, 528, 314, 1129, 29877, 322, 4195, 261, 29889, 5806, 738, 528, 314, 1129, 29877, 470, 4195, 261, 674, 664, 29892, 372, 338, 2253, 304, 671, 2730, 391, 332, 5281, 528, 314, 1129, 359, 322, 4195, 414, 29892, 7148, 565, 596, 8716, 29886, 338, 15589, 322, 372, 23766, 29889, 3834, 9316, 1316, 408, 1183, 29874, 2730, 391, 545, 263, 1341 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 1207, 263, 19408, 413, 3780, 12343, 7539, 282, 5863, 29889, 10306, 278, 11994, 363, 3907, 413, 295, 1188, 29887, 29915, 29879, 19408, 413, 3780, 29886, 583, 2578, 1446, 2441, 29889, 1670, 674, 3117, 367, 1048, 4203, 263, 9853, 310, 19408, 413, 3780, 29886, 583, 7539, 2175, 29889, 313, 697, 310, 278, 2625, 23633, 310, 1641, 278, 7984, 29892, 338, 366, 679, 304, 17545, 738, 29915, 454, 29888 +1, 349, 1691, 322, 24980, 1338, 29901, 1128, 304, 260, 4003, 8343, 263, 2653, 23717, 29889, 402, 1624, 596, 28075, 29889, 887, 674, 817, 263, 29871, 29896, 29906, 21759, 269, 4316, 19144, 29892, 263, 4964, 14051, 495, 8343, 292, 260, 4003, 29892, 322, 263, 29871, 29896, 29953, 29899, 22466, 318, 276, 386, 1705, 274, 493, 1308, 411, 263, 24235, 310, 29871, 29945, 285, 4615, 313, 1454, 2319, 26361, 29897, 322, 29871, 29947, 285, 4615, 313, 1454 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 2867, 297, 2373, 296, 454, 1624, 17394, 267, 29889, 5373, 29891, 17394, 267, 393, 6216, 1532, 29889, 3080, 326, 675, 278, 817, 363, 16116, 292, 470, 16679, 297, 491, 2805, 2373, 296, 454, 1624, 17394, 267, 393, 526, 2307, 263, 1781, 6216, 363, 366, 29889, 4001, 2373, 296, 454, 1624, 338, 380, 2593, 322, 29395, 990, 4357, 29892, 366, 29915, 276, 451, 2675, 304, 367, 2221, 304, 1735 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 9563, 470, 2329, 263, 528, 10511, 1283, 321, 29891, 774, 798, 29889, 14542, 852, 385, 321, 29891, 774, 798, 282, 3977, 309, 322, 4764, 672, 393, 338, 2788, 304, 596, 5613, 2927, 292, 29889, 960, 366, 1603, 505, 697, 310, 596, 321, 29891, 774, 5727, 29892, 445, 1795, 367, 263, 2217, 6775, 489, 5143, 1993, 278, 282, 3977, 309, 304, 278, 528, 1943, 310, 596, 321, 29891, 774 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 5376, 411, 540, 5031, 23448, 263, 29889, 19530, 29876, 278, 25828, 4835, 29889, 940, 5031, 23448, 263, 756, 263, 1353, 310, 25828, 4835, 393, 12234, 2615, 1546, 1023, 322, 4832, 11405, 515, 278, 2635, 310, 14060, 545, 29889, 3834, 310, 1438, 25828, 
4835, 526, 10035, 29892, 763, 263, 1238, 369, 29892, 1550, 4045, 29892, 763, 432, 585, 299, 625, 29892, 526, 2649, 29873, 744, 18906, 310, 540 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 4529, 596, 923, 1416, 29890, 2873, 29889, 23868, 278, 1492, 11955, 29889, 887, 674, 817, 472, 3203, 1023, 1422, 528, 3076, 310, 1207, 786, 29901, 697, 393, 338, 16951, 6501, 261, 1135, 596, 5613, 19309, 16225, 363, 278, 528, 23626, 322, 697, 393, 338, 925, 263, 2217, 301, 14643, 1135, 596, 19309, 363, 278, 12141, 29879, 29889, 28277, 373, 596, 19309, 16225, 322, 24583, 29892, 1438, 508, 367 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 671, 429, 4542, 29875, 1218, 528, 314, 1129, 29877, 29889, 317, 802, 29872, 911, 263, 12616, 29899, 29879, 1891, 8828, 4757, 310, 528, 314, 1129, 29877, 304, 596, 5112, 29885, 29889, 319, 2217, 2586, 310, 429, 4542, 29875, 1218, 528, 314, 1129, 29877, 5771, 263, 1472, 982, 29889, 2860, 7990, 1259, 596, 11315, 297, 278, 1510, 261, 408, 366, 12891, 723, 29892, 269, 802, 29872, 911, 1048, 263 +1, 25453, 322, 17465, 292, 29901, 1128, 304, 1207, 521, 332, 307, 26163, 5036, 12580, 3137, 29889, 4721, 354, 271, 278, 288, 854, 304, 29871, 29946, 29945, 29900, 6719, 285, 21446, 6884, 470, 29871, 29906, 29906, 29945, 6719, 6432, 1039, 375, 636, 4122, 559, 263, 286, 3096, 262, 260, 764, 491, 285, 492, 3262, 372, 373, 967, 2625, 29889, 8669, 310, 805, 764, 292, 1661, 29899, 303, 860, 1395, 5832, 805, 764, 297, 278, 4251, 310 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 6483, 285, 719, 541, 725, 329, 10674, 1161, 29889, 14542, 852, 385, 17182, 29889, 1932, 23906, 385, 17182, 304, 6483, 285, 719, 596, 541, 725, 329, 10674, 1161, 297, 29892, 372, 29915, 29879, 4100, 304, 5839, 697, 393, 756, 263, 6133, 25158, 1298, 1135, 278, 7984, 292, 10430, 29889, 1152, 1342, 29892, 565, 366, 505, 263, 9687, 393, 4225, 304, 367, 7984, 287, 472, 29871, 29941, 29945 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 385, 288, 9258, 17182, 2730, 391, 332, 3950, 29889, 349, 27574, 953, 7273, 9215, 281, 1165, 515, 263, 3240, 737, 261, 393, 4266, 7093, 297, 28075, 363, 3907, 3632, 331, 1943, 6776, 2527, 1199, 29889, 1670, 526, 1784, 5376, 414, 393, 508, 367, 1476, 7395, 1058, 19417, 953, 7273, 9215, 281, 1165, 636, 5373, 29891, 777, 18853, 288, 2719, 393, 366, 723, 763, 304, 671, 297, 596 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 5376, 411, 14919, 21549, 29889, 1260, 8332, 403, 14919, 21549, 29899, 513, 1682, 292, 9687, 322, 29914, 272, 13748, 515, 596, 652, 300, 29889, 739, 10083, 2560, 29892, 541, 6480, 825, 366, 2348, 342, 14218, 508, 505, 263, 12176, 10879, 373, 596, 14919, 21549, 11174, 29889, 960, 366, 8369, 7535, 11223, 24937, 29892, 7243, 18219, 29892, 470, 851, 11517, 1432, 2462, 29892, 3814, 304, 2334, 472, 3203 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 19531, 3708, 552, 17441, 303, 860, 29889, 14542, 852, 12528, 301, 309, 562, 470, 22181, 1581, 17441, 303, 860, 363, 6534, 19309, 260, 2873, 29889, 7419, 363, 301, 14643, 29899, 2780, 287, 3708, 552, 17441, 303, 7358, 411, 7254, 22332, 2873, 29892, 1316, 408, 540, 1624, 470, 3805, 275, 528, 3076, 29892, 304, 1035, 296, 27240, 278, 7254, 22332, 2873, 297, 596, 15509, 19309, 29889, 4525, 674, 19595 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 596, 269, 484, 21079, 1106, 716, 1449, 29889, 399, 1161, 10508, 269, 484, 21079, 297, 278, 471, 2790, 4933, 29889, 960, 366, 505, 777, 26616, 10508, 269, 484, 21079, 29892, 366, 508, 5948, 679, 963, 5941, 491, 17452, 963, 297, 278, 471, 2790, 4933, 29892, 925, 408, 366, 723, 
738, 916, 26616, 7171, 358, 29889, 887, 1122, 884, 367, 2221, 304, 471, 29882, 777, 1661, 29899, 15257 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 6755, 263, 18870, 262, 398, 9228, 29889, 14542, 852, 263, 9228, 411, 1880, 3708, 537, 29889, 1094, 411, 599, 758, 8802, 1539, 1338, 29892, 18870, 262, 398, 1818, 367, 394, 2376, 287, 411, 916, 1539, 1338, 297, 1797, 304, 6176, 278, 2898, 2264, 3734, 363, 432, 809, 295, 719, 29889, 739, 338, 4049, 394, 2376, 287, 411, 1661, 29899, 1457, 8802, 1539, 1338, 763, 1302, 2496, 470, 274 diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 10e39fd39..bc917df72 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -98,7 +98,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten head_num_, size_per_head_, rotary_embedding_dim_, - neox_rotary_style_, + false, attention_weights->query_weight.scale_out, 0, // int8_mode stream_); @@ -294,20 +294,15 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse, - int int8_mode): - BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), + bool is_qk_buf_float): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), max_batch_size_(max_batch_size), max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), rotary_embedding_dim_(0), - neox_rotary_style_(false), - is_qk_buf_float_(is_qk_buf_float || int8_mode == 2), - weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), - int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr) + is_qk_buf_float_(is_qk_buf_float) { } @@ -321,20 +316,15 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse, - int int8_mode): - BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), + bool is_qk_buf_float): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), max_batch_size_(max_batch_size), max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), rotary_embedding_dim_(0), - neox_rotary_style_(false), - is_qk_buf_float_(is_qk_buf_float || int8_mode == 2), - weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), - int8_fc_runner_(int8_mode == 2 ? 
std::make_shared>() : nullptr) + is_qk_buf_float_(is_qk_buf_float) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); dispatcher_fp16.reset(new FusedMHARunnerFP16v2(head_num_, size_per_head_, sm_, 1.0f)); @@ -347,25 +337,19 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b size_t size_per_head, size_t local_head_num, size_t rotary_embedding_dim, - bool neox_rotary_style, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse, - int int8_mode): - BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), + bool is_qk_buf_float): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), max_batch_size_(max_batch_size), max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), rotary_embedding_dim_(rotary_embedding_dim), - neox_rotary_style_(neox_rotary_style), - is_qk_buf_float_(is_qk_buf_float), - weight_only_int8_fc_runner_(int8_mode == 1 ? std::make_shared>() : nullptr), - int8_fc_runner_(int8_mode == 2 ? std::make_shared>() : nullptr) + is_qk_buf_float_(is_qk_buf_float) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); dispatcher_fp16.reset(new FusedMHARunnerFP16v2(head_num_, size_per_head_, sm_, 1.0f)); @@ -376,18 +360,14 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(LLaMAContextAttentionL BaseAttentionLayer(attention_layer.stream_, attention_layer.cublas_wrapper_, attention_layer.allocator_, - attention_layer.is_free_buffer_after_forward_, - attention_layer.sparse_), + attention_layer.is_free_buffer_after_forward_), max_batch_size_(attention_layer.max_batch_size_), max_seq_len_(attention_layer.max_seq_len_), head_num_(attention_layer.head_num_), size_per_head_(attention_layer.size_per_head_), hidden_units_(attention_layer.hidden_units_), rotary_embedding_dim_(attention_layer.rotary_embedding_dim_), - neox_rotary_style_(attention_layer.neox_rotary_style_), - is_qk_buf_float_(attention_layer.is_qk_buf_float_), - weight_only_int8_fc_runner_(attention_layer.weight_only_int8_fc_runner_), - int8_fc_runner_(attention_layer.int8_fc_runner_) + is_qk_buf_float_(attention_layer.is_qk_buf_float_) { } @@ -408,25 +388,25 @@ template void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, true); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, true); + qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, false); k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; v_buf_2_ = k_buf_2_ + batch_size * seq_len * hidden_units_; // save memory usage when using fmha if (allocate_qk_buf) { - qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, true); + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, false); } else { allocator_->free((void**)(&qk_buf_)); } - qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, true); - qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, true); + qkv_buf_2_ = 
(T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, false); + qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, false); if (is_qk_buf_float_ == true) { if (allocate_qk_buf) { qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, true); + qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, false); } else { allocator_->free((void**)(&qk_buf_float_)); @@ -451,12 +431,6 @@ void LLaMAContextAttentionLayer::freeBuffer() allocator_->free((void**)(&qk_buf_float_)); } - allocator_->free((void**)(&mixed_gemm_workspace_)); - mixed_gemm_ws_bytes_ = 0; - - allocator_->free((void**)(&int8_gemm_workspace_)); - int8_gemm_ws_bytes_ = 0; - is_allocate_buffer_ = false; } } diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index e9086e278..635d3d15a 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -36,7 +36,6 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { const size_t size_per_head_; const size_t hidden_units_; const size_t rotary_embedding_dim_; - const bool neox_rotary_style_; // fmha runner int sm_ = getSMVersion(); @@ -52,13 +51,9 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { bool is_qk_buf_float_; - std::shared_ptr> weight_only_int8_fc_runner_; - std::shared_ptr> int8_fc_runner_; - protected: using BaseAttentionLayer::allocator_; using BaseAttentionLayer::stream_; - using BaseAttentionLayer::sparse_; T* qkv_buf_ = nullptr; T* q_buf_2_ = nullptr; T* k_buf_2_ = nullptr; @@ -67,10 +62,6 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { float* qk_buf_float_ = nullptr; T* qkv_buf_2_ = nullptr; T* qkv_buf_3_ = nullptr; - char* mixed_gemm_workspace_ = nullptr; - size_t mixed_gemm_ws_bytes_ = 0; - char* int8_gemm_workspace_ = nullptr; - size_t int8_gemm_ws_bytes_ = 0; public: LLaMAContextAttentionLayer(size_t max_batch_size, @@ -81,9 +72,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse = false, - int int8_mode = 0); + bool is_qk_buf_float); LLaMAContextAttentionLayer(size_t max_batch_size, size_t max_seq_len, @@ -94,9 +83,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse = false, - int int8_mode = 0); + bool is_qk_buf_float); LLaMAContextAttentionLayer(size_t max_batch_size, size_t max_seq_len, @@ -104,14 +91,11 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { size_t size_per_head, size_t local_head_num, size_t rotary_embedding_dim, - bool neox_rotary_style_, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, - bool sparse = false, - int int8_mode = 0); + bool is_qk_buf_float); LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer); diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 02d46b5b9..e0d8d1c99 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ 
b/src/fastertransformer/models/llama/LLaMA.cc @@ -31,7 +31,6 @@ void LLaMA::initialize() inter_size_, num_layer_, rotary_embedding_dim_, - neox_rotary_style_, layernorm_eps_, pipeline_para_, stream_, @@ -39,9 +38,7 @@ void LLaMA::initialize() allocator_, is_free_buffer_after_forward_, is_context_qk_buf_float_, - attention_type_, - custom_all_reduce_comm_, - enable_custom_all_reduce_); + attention_type_); } template @@ -61,22 +58,14 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_ input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_cache_seq_len, false)); decoder_output_buf_ = (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); - logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * vocab_size_, false)); - sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batch_size, false)); + //logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * max_seq_len * vocab_size_, false)); - key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); + key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); value_cache_ = key_cache_ + self_cache_size; tiled_input_ids_buf_ = - (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batch_size * max_input_len, true)); - tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batch_size, true)); - - transposed_output_ids_buf_ = - (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batch_size * max_seq_len, true)); - output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batch_size * max_seq_len, true)); - - start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, false)); - end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false)); + (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batch_size * max_input_len, false)); + tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batch_size, false)); context_decoder_input_buf_ = (T*)(allocator_->reMalloc( context_decoder_input_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); @@ -92,8 +81,7 @@ void LLaMA::freeBuffer() if (is_allocate_buffer_) { allocator_->free((void**)(&input_attention_mask_)); allocator_->free((void**)(&decoder_output_buf_)); - allocator_->free((void**)(&logits_buf_)); - allocator_->free((void**)(&sequence_lengths_)); + //allocator_->free((void**)(&logits_buf_)); allocator_->free((void**)(&key_cache_)); if (cache_indirections_[0] != nullptr) { @@ -103,11 +91,6 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&tiled_input_ids_buf_)); allocator_->free((void**)(&tiled_input_lengths_buf_)); - allocator_->free((void**)(&transposed_output_ids_buf_)); - allocator_->free((void**)(&output_ids_buf_)); - allocator_->free((void**)(&start_ids_buf_)); - allocator_->free((void**)(&end_ids_buf_)); - allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); @@ -128,9 +111,7 @@ LLaMA::LLaMA(size_t head_num, IAllocator* allocator, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, - AttentionType attention_type, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, 
is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -161,9 +142,7 @@ LLaMA::LLaMA(size_t head_num, IAllocator* allocator, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, - AttentionType attention_type, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -173,8 +152,6 @@ LLaMA::LLaMA(size_t head_num, rotary_embedding_dim_(rotary_embedding_dim), hidden_units_(head_num * size_per_head), pipeline_para_(pipeline_para), - custom_all_reduce_comm_(custom_all_reduce_comm), - enable_custom_all_reduce_(enable_custom_all_reduce), attention_type_(attention_type) { initialize(); @@ -191,8 +168,6 @@ LLaMA::LLaMA(LLaMA const& llama): rotary_embedding_dim_(llama.rotary_embedding_dim_), hidden_units_(llama.hidden_units_), pipeline_para_(llama.pipeline_para_), - custom_all_reduce_comm_(llama.custom_all_reduce_comm_), - enable_custom_all_reduce_(llama.enable_custom_all_reduce_), attention_type_(llama.attention_type_) { initialize(); @@ -205,20 +180,6 @@ LLaMA::~LLaMA() freeBuffer(); } -template -void LLaMA::registerCallback(callback_sig* fn, void* ctx) -{ - token_generated_cb_ = fn; - token_generated_ctx_ = ctx; -} - -template -void LLaMA::unRegisterCallback() -{ - token_generated_cb_ = nullptr; - token_generated_ctx_ = nullptr; -} - template void LLaMA::forward(std::vector* output_tensors, const std::vector* input_tensors, @@ -274,8 +235,8 @@ void LLaMA::forward(std::unordered_map* output_ten max_seq_len); } const cudaDataType_t gemm_data_type = getCudaDataType(); - allocateBuffer(batch_size, max_seq_len, max_cache_seq_len, max_input_length); + allocateBuffer(batch_size, max_seq_len, max_cache_seq_len, max_input_length); sync_check_cuda_error(); const DataType data_type = getTensorType(); @@ -288,11 +249,6 @@ void LLaMA::forward(std::unordered_map* output_ten const std::vector self_v_cache_shape = { num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_cache_seq_len, size_per_head_}; - // initialize the output ids and parent ids - cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * max_seq_len, stream_); - sync_check_cuda_error(); - - // handle first step invokeTileGptInputs(tiled_input_ids_buf_, tiled_input_lengths_buf_, input_tensors->at("input_ids").getPtr(), @@ -304,7 +260,7 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, - output_ids_buf_, + nullptr, llama_weights->pre_decoder_embedding_table, llama_weights->position_encoding_table, pPromptTuningParam{}, // no p/prompt tuning @@ -347,7 +303,6 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { - T* output_logits = output_tensors->at("output_logits").getPtr(); invokeGeneralLLaMALayerNorm(context_decoder_input_buf_, context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, @@ -357,6 +312,9 @@ void LLaMA::forward(std::unordered_map* output_ten hidden_units_, stream_); sync_check_cuda_error(); + + // FIXME: debugging + T *output_logits = output_tensors->at("output_logits").getPtr(); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, vocab_size_, @@ -367,6 +325,7 @@ void LLaMA::forward(std::unordered_map* output_ten context_decoder_input_buf_, hidden_units_, // n 
output_logits, + //logits_buf_, vocab_size_); sync_check_cuda_error(); } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 3b7995927..26d1a6696 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -36,27 +36,22 @@ class LLaMA: public BaseLayer { size_t vocab_size_; size_t rotary_embedding_dim_; - static constexpr bool neox_rotary_style_ = false; - static constexpr float layernorm_eps_ = 1e-6f; + static constexpr float layernorm_eps_ = 1e-6f; size_t hidden_units_; NcclParam tensor_para_; NcclParam pipeline_para_; - std::shared_ptr custom_all_reduce_comm_; - int enable_custom_all_reduce_; - AttentionType attention_type_; const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); - LLaMAContextDecoder* llama_context_decoder_; + LLaMAContextDecoder* llama_context_decoder_; void allocateBuffer() override; - void allocateBuffer( - size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); + void allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); void freeBuffer() override; void initialize(); @@ -67,64 +62,49 @@ class LLaMA: public BaseLayer { float* logits_buf_; - int* sequence_lengths_ = nullptr; - T* key_cache_; T* value_cache_; int* cache_indirections_[2] = {nullptr, nullptr}; - int* tiled_input_ids_buf_; - int* tiled_input_lengths_buf_; - int* transposed_output_ids_buf_; - int* output_ids_buf_; - int* start_ids_buf_; - int* end_ids_buf_; + int* tiled_input_ids_buf_; + int* tiled_input_lengths_buf_; - T* context_decoder_input_buf_; - T* context_decoder_output_buf_; - - // function pointer callback - using callback_sig = void(std::unordered_map*, void*); - callback_sig* token_generated_cb_ = nullptr; - void* token_generated_ctx_ = nullptr; + T* context_decoder_input_buf_; + T* context_decoder_output_buf_; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); public: - LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long random_seed, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); - - LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long random_seed, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + unsigned long long random_seed, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + 
AttentionType attention_type = AttentionType::UNFUSED_MHA); + + LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + unsigned long long random_seed, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA); LLaMA(LLaMA const& LLaMA); @@ -143,9 +123,6 @@ class LLaMA: public BaseLayer { size_t getTensorParallelRank(); size_t getTensorParallelSize(); bool* getFinishBuffer(); - - void registerCallback(callback_sig* fn, void* ctx); - void unRegisterCallback(); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 49af917de..06541af4b 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -32,14 +32,11 @@ void LLaMAContextDecoder::initialize() size_per_head_, head_num_, rotary_embedding_dim_, - neox_rotary_style_, stream_, cublas_wrapper_, allocator_, is_free_buffer_after_forward_, - is_qk_buf_float_, - false, - 0); + is_qk_buf_float_); ffn_layer_ = new SiluFfnLayer(0, // max_batch_size 0, // max_seq_len @@ -130,7 +127,6 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t size_t inter_size, size_t num_layer, size_t rotary_embedding_dim, - bool neox_rotary_style, float layernorm_eps, NcclParam pipeline_para, cudaStream_t stream, @@ -138,23 +134,18 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t IAllocator* allocator, bool is_free_buffer_after_forward, bool is_qk_buf_float, - AttentionType attention_type, - std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), head_num_(head_num), size_per_head_(size_per_head), inter_size_(inter_size), num_layer_(num_layer), rotary_embedding_dim_(rotary_embedding_dim), - neox_rotary_style_(neox_rotary_style), layernorm_eps_(layernorm_eps), hidden_units_(head_num * size_per_head), pipeline_para_(pipeline_para), is_qk_buf_float_(is_qk_buf_float), - attention_type_(attention_type), - custom_all_reduce_comm_(custom_all_reduce_comm), - enable_custom_all_reduce_(enable_custom_all_reduce) + attention_type_(attention_type) { initialize(); } @@ -167,14 +158,11 @@ LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decode inter_size_(decoder.inter_size_), num_layer_(decoder.num_layer_), rotary_embedding_dim_(decoder.rotary_embedding_dim_), - neox_rotary_style_(decoder.neox_rotary_style_), layernorm_eps_(decoder.layernorm_eps_), hidden_units_(decoder.hidden_units_), pipeline_para_(decoder.pipeline_para_), is_qk_buf_float_(decoder.is_qk_buf_float_), - attention_type_(decoder.attention_type_), - custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), - enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) + attention_type_(decoder.attention_type_) { initialize(); } @@ -253,6 +241,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* AttentionType attention_type = attention_type_; const bool is_unpadded_mha = isUnPaddedMHA(attention_type); + size_t h_token_num = batch_size * seq_len; if (is_unpadded_mha) { const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); diff --git 
a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index 115b3b06b..452567208 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -41,7 +41,6 @@ class LLaMAContextDecoder: public BaseLayer { size_t inter_size_; size_t num_layer_; size_t rotary_embedding_dim_; - bool neox_rotary_style_; float layernorm_eps_; // calculated data @@ -49,9 +48,6 @@ class LLaMAContextDecoder: public BaseLayer { NcclParam pipeline_para_; - std::shared_ptr custom_all_reduce_comm_; - int enable_custom_all_reduce_; - AttentionType attention_type_; bool is_qk_buf_float_; @@ -85,7 +81,6 @@ class LLaMAContextDecoder: public BaseLayer { size_t inter_size, size_t num_layer, size_t rotary_embedding_dim, - bool neox_rotary_style, float layernorm_eps, NcclParam pipeline_para, cudaStream_t stream, @@ -93,9 +88,7 @@ class LLaMAContextDecoder: public BaseLayer { IAllocator* allocator, bool is_free_buffer_after_forward, bool is_qk_buf_float, - AttentionType attention_type = AttentionType::FUSED_MHA, - std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce_ = 0); + AttentionType attention_type = AttentionType::FUSED_MHA); LLaMAContextDecoder(LLaMAContextDecoder const& decoder); From 321bc736ad9086d41b7191bf85ff6d5f1b728980 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 18 Sep 2023 15:58:57 +0000 Subject: [PATCH 19/55] buf fix --- .../layers/attention_layers/LLaMAContextAttentionLayer.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index bc917df72..f22fa3032 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -399,6 +399,7 @@ void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq } else { allocator_->free((void**)(&qk_buf_)); + qk_buf_ = nullptr; } qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, false); qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, false); @@ -410,6 +411,7 @@ void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq } else { allocator_->free((void**)(&qk_buf_float_)); + qk_buf_float_ = nullptr; } } From 837e9d7ab3a801cafa7ac0f570ba4f211e368dac Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 18 Sep 2023 16:22:09 +0000 Subject: [PATCH 20/55] dump --- examples/cpp/llama/llama_example.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index d2c8dcf51..721f4aef5 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -269,6 +269,7 @@ void llama_example(const INIReader reader) POP_RANGE; ft_nvtx::resetScope(); + /* if (rank == world_size - 1) { T* out = (T*)malloc(sizeof(T) * request_batch_size * total_output_len * vocab_size); cudaMemcpy(out, @@ -289,6 +290,7 @@ void llama_example(const INIReader reader) std::cout << "\n"; free(out); } + */ // test time cudaProfilerStart(); @@ -301,7 +303,7 @@ void llama_example(const INIReader reader) ft_nvtx::setScope("total_time"); PUSH_RANGE("total time") // warm up - ite = 3; + ite = 10; for (int i = 0; i < ite; ++i) { 
llama.forward(&output_tensors, &input_tensors, &llama_weights); } From 56c33256a96c61acfaa06fadedd0b40f8b60ea82 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 18 Sep 2023 17:09:55 +0000 Subject: [PATCH 21/55] add gemm_cofing.in --- examples/cpp/llama/gemm_config.in | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 examples/cpp/llama/gemm_config.in diff --git a/examples/cpp/llama/gemm_config.in b/examples/cpp/llama/gemm_config.in new file mode 100644 index 000000000..8a93b9027 --- /dev/null +++ b/examples/cpp/llama/gemm_config.in @@ -0,0 +1,12 @@ +batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time +32 256 52 128 1 ### 1 19968 8192 6656 21 0 24 1 0 0 0 0 20.595835 +32 256 52 128 1 ### 1664 256 256 128 103 -1 -1 -1 -1 -1 -1 -1 0.929050 +32 256 52 128 1 ### 1664 128 256 256 103 -1 -1 -1 -1 -1 -1 -1 0.661050 +32 256 52 128 1 ### 1 6656 8192 6656 21 0 24 1 0 0 0 0 6.882683 +32 256 52 128 1 ### 1 17920 8192 6656 21 0 24 1 0 0 0 0 18.293156 +32 256 52 128 1 ### 1 6656 8192 17920 21 0 24 2 0 1 6656 0 18.400911 +32 1 52 128 1 ### 1 19968 32 6656 3 0 21 1 1 0 0 0 0.328397 +32 1 52 128 1 ### 1 6656 32 6656 21 0 15 6 0 1 416 0 0.131215 +32 1 52 128 1 ### 1 17920 32 6656 99 -1 -1 -1 -1 -1 -1 -1 0.306050 +32 1 52 128 1 ### 1 6656 32 17920 21 0 15 6 0 1 416 0 0.312504 +32 1 52 128 1 ### 1 32000 32 6656 99 -1 -1 -1 -1 -1 -1 -1 0.753770 From 4a0a9d708ea1780f47e711ffd89d2ec286c12c05 Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 19 Sep 2023 03:54:41 +0000 Subject: [PATCH 22/55] remove backup file trace --- examples/cpp/llama/backup.csv | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 examples/cpp/llama/backup.csv diff --git a/examples/cpp/llama/backup.csv b/examples/cpp/llama/backup.csv deleted file mode 100644 index eb28ed345..000000000 --- a/examples/cpp/llama/backup.csv +++ /dev/null @@ -1,32 +0,0 @@ -1, 14542, 3262, 8112, 29901, 7803, 1757, 526, 13407, 297, 263, 13569, 29889, 2688 -1, 7392, 1026, 29901, 319, 11379, 15028, 297, 263, 17948, 8693, 29889, 450, 11379 -1, 5057, 12500, 29901, 319, 8023, 338, 2734, 1623, 263, 5702, 29889, 450, 8023 -1, 17984, 18558, 29901, 1334, 1074, 263, 767, 13407, 297, 263, 5716, 29889, 450, 767 -1, 19509, 263, 1766, 25206, 29901, 11647, 526, 2734, 373, 278, 11952, 29889, 319, 767 -1, 7412, 292, 11565, 29901, 11647, 526, 16246, 5742, 6131, 13587, 3949, 18464, 29889, 11647 -1, 7412, 292, 10311, 26082, 29901, 319, 767, 338, 16246, 2768, 263, 5716, 29889, 940 -1, 323, 4524, 29901, 319, 767, 322, 6114, 526, 16246, 373, 263, 7408, 4208, 29889, 2688 -1, 8565, 375, 3183, 29901, 319, 767, 338, 13407, 701, 322, 8026, 263, 7679, 29889, 11647 -1, 12878, 292, 278, 11203, 29901, 319, 6114, 17042, 2039, 714, 11480, 278, 17455, 29889, 7803, 2319, 26361 -1, 5057, 12500, 29901, 319, 6114, 338, 22049, 3412, 263, 5702, 29889, 2296 -1, 8565, 11203, 29901, 319, 2919, 19174, 338, 22229, 2820, 263, 1746, 29889, 11647 -1, 8360, 5367, 29901, 319, 6114, 338, 409, 630, 472, 263, 1591, 29889, 2296 -1, 476, 484, 14067, 29901, 319, 767, 17905, 1379, 373, 263, 17132, 29889, 450, 767 -1, 7412, 292, 378, 25496, 29901, 319, 767, 338, 16246, 5742, 1023, 28987, 29889, 940 -1, 1706, 262, 1076, 29901, 319, 767, 338, 16246, 373, 385, 15058, 4768, 446, 29889, 940 -1, 390, 5086, 11308, 29901, 319, 767, 338, 1153, 9292, 11308, 297, 263, 29413, 29889, 940 -1, 7412, 292, 11210, 336, 29901, 319, 767, 715, 16926, 263, 21387, 
964, 670, 11210, 29889, 940 -1, 7412, 292, 11210, 336, 29901, 319, 4123, 767, 269, 1169, 373, 263, 6592, 29889, 450, 767 -1, 28551, 292, 29901, 11647, 526, 13407, 373, 263, 17306, 310, 15007, 29889, 11647 -1, 8481, 24613, 1847, 29901, 319, 767, 338, 13407, 373, 263, 19587, 11952, 29889, 940 -1, 7412, 292, 1248, 29877, 29901, 11647, 526, 2381, 25217, 297, 278, 4094, 29889, 7803, 5866 -1, 5057, 12500, 29901, 319, 767, 15028, 297, 278, 7256, 310, 263, 10728, 1974, 29889, 29445, 6289 -1, 5057, 12500, 29901, 319, 767, 338, 4318, 2734, 1623, 263, 5702, 29889, 940 -1, 6781, 8522, 29901, 11647, 526, 13407, 373, 263, 1746, 9963, 29889, 450, 1757 -1, 8565, 11203, 29901, 319, 6114, 338, 8743, 411, 263, 11203, 29889, 450, 11203 -1, 3925, 25217, 29901, 319, 2381, 25217, 11565, 338, 4318, 297, 263, 5716, 29889, 7567 -1, 6163, 23131, 292, 29901, 12753, 2305, 748, 23131, 292, 1623, 263, 10952, 29889, 2688 -1, 399, 336, 3262, 22981, 29901, 1334, 1074, 263, 3800, 373, 18187, 29889, 319, 2022 -1, 28551, 292, 29901, 319, 2022, 338, 14993, 292, 1623, 263, 17306, 310, 15007, 29889, 2688 -1, 399, 1161, 292, 3700, 29901, 319, 6114, 338, 13407, 297, 263, 5716, 9963, 29889, 2296 -1, 2522, 11495, 1933, 292, 29901, 319, 767, 338, 1090, 4094, 297, 263, 11565, 29889, 940 From e63b85b57bc4e1d49172b323a2e658e72c8e2f2d Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 19 Sep 2023 03:56:16 +0000 Subject: [PATCH 23/55] remove gemm_config.in --- examples/cpp/llama/gemm_config.in | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 examples/cpp/llama/gemm_config.in diff --git a/examples/cpp/llama/gemm_config.in b/examples/cpp/llama/gemm_config.in deleted file mode 100644 index 8a93b9027..000000000 --- a/examples/cpp/llama/gemm_config.in +++ /dev/null @@ -1,12 +0,0 @@ -batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time -32 256 52 128 1 ### 1 19968 8192 6656 21 0 24 1 0 0 0 0 20.595835 -32 256 52 128 1 ### 1664 256 256 128 103 -1 -1 -1 -1 -1 -1 -1 0.929050 -32 256 52 128 1 ### 1664 128 256 256 103 -1 -1 -1 -1 -1 -1 -1 0.661050 -32 256 52 128 1 ### 1 6656 8192 6656 21 0 24 1 0 0 0 0 6.882683 -32 256 52 128 1 ### 1 17920 8192 6656 21 0 24 1 0 0 0 0 18.293156 -32 256 52 128 1 ### 1 6656 8192 17920 21 0 24 2 0 1 6656 0 18.400911 -32 1 52 128 1 ### 1 19968 32 6656 3 0 21 1 1 0 0 0 0.328397 -32 1 52 128 1 ### 1 6656 32 6656 21 0 15 6 0 1 416 0 0.131215 -32 1 52 128 1 ### 1 17920 32 6656 99 -1 -1 -1 -1 -1 -1 -1 0.306050 -32 1 52 128 1 ### 1 6656 32 17920 21 0 15 6 0 1 416 0 0.312504 -32 1 52 128 1 ### 1 32000 32 6656 99 -1 -1 -1 -1 -1 -1 -1 0.753770 From 3a103088af353c2881e7be3fda3a9e00dc4dc4d4 Mon Sep 17 00:00:00 2001 From: dypshong Date: Wed, 20 Sep 2023 07:05:56 +0000 Subject: [PATCH 24/55] dumdump --- examples/cpp/llama/llama_example.cc | 2 - .../kernels/bert_preprocess_kernels.cu | 2 +- src/fastertransformer/models/llama/LLaMA.cc | 95 ++++++++++--------- .../models/llama/LLaMAContextDecoder.cc | 27 +++--- 4 files changed, 63 insertions(+), 63 deletions(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 721f4aef5..2359cf022 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -269,7 +269,6 @@ void llama_example(const INIReader reader) POP_RANGE; ft_nvtx::resetScope(); - /* if (rank == world_size - 1) { T* out = (T*)malloc(sizeof(T) * request_batch_size * total_output_len * vocab_size); cudaMemcpy(out, @@ 
-290,7 +289,6 @@ void llama_example(const INIReader reader) std::cout << "\n"; free(out); } - */ // test time cudaProfilerStart(); diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.cu b/src/fastertransformer/kernels/bert_preprocess_kernels.cu index a57161c85..8179c3368 100644 --- a/src/fastertransformer/kernels/bert_preprocess_kernels.cu +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.cu @@ -467,4 +467,4 @@ template void invokeQuantizeMatrixRebuildPadding::allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_ input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_cache_seq_len, false)); decoder_output_buf_ = (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); - //logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * max_seq_len * vocab_size_, false)); + // logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * max_seq_len * + // vocab_size_, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); value_cache_ = key_cache_ + self_cache_size; @@ -81,7 +82,7 @@ void LLaMA::freeBuffer() if (is_allocate_buffer_) { allocator_->free((void**)(&input_attention_mask_)); allocator_->free((void**)(&decoder_output_buf_)); - //allocator_->free((void**)(&logits_buf_)); + // allocator_->free((void**)(&logits_buf_)); allocator_->free((void**)(&key_cache_)); if (cache_indirections_[0] != nullptr) { @@ -99,19 +100,19 @@ void LLaMA::freeBuffer() } template -LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long random_seed, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type): +LLaMA::LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + unsigned long long random_seed, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -128,21 +129,21 @@ LLaMA::LLaMA(size_t head_num, } template -LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long random_seed, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type): +LLaMA::LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + unsigned long long random_seed, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -259,24 +260,26 @@ void 
LLaMA::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); - invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, - nullptr, - llama_weights->pre_decoder_embedding_table, - llama_weights->position_encoding_table, - pPromptTuningParam{}, // no p/prompt tuning - tiled_input_ids_buf_, - 1, - max_input_length, - max_input_length, - batch_size, - hidden_units_, - stream_); - sync_check_cuda_error(); - invokeBuildDecoderAttentionMask( input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, max_input_length, 0, stream_); sync_check_cuda_error(); + if (pipeline_para_.rank_ == 0) { + invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, + nullptr, + llama_weights->pre_decoder_embedding_table, + llama_weights->position_encoding_table, + pPromptTuningParam{}, // no p/prompt tuning + tiled_input_ids_buf_, + 1, + max_input_length, + max_input_length, + batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); + } + std::unordered_map decoder_input_tensors{ {"decoder_input", Tensor{ @@ -314,7 +317,7 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); // FIXME: debugging - T *output_logits = output_tensors->at("output_logits").getPtr(); + T* output_logits = output_tensors->at("output_logits").getPtr(); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, vocab_size_, @@ -325,7 +328,7 @@ void LLaMA::forward(std::unordered_map* output_ten context_decoder_input_buf_, hidden_units_, // n output_logits, - //logits_buf_, + // logits_buf_, vocab_size_); sync_check_cuda_error(); } diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 06541af4b..08980923a 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -122,19 +122,19 @@ int LLaMAContextDecoder::getFirstLayerParallelId() } template -LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - AttentionType attention_type): +LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + float layernorm_eps, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), head_num_(head_num), size_per_head_(size_per_head), @@ -241,7 +241,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* AttentionType attention_type = attention_type_; const bool is_unpadded_mha = isUnPaddedMHA(attention_type); - size_t h_token_num = batch_size * seq_len; if (is_unpadded_mha) { const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); From be298831407d766df29d8c173ffad2bf99e56499 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 22 Sep 2023 18:36:48 +0000 Subject: [PATCH 25/55] test done --- examples/cpp/llama/llama_config.ini | 4 +- examples/cpp/llama/llama_example.cc | 33 ++- .../LLaMAContextAttentionLayer.cc | 70 +++--- .../LLaMAContextAttentionLayer.h | 40 ++-- 
src/fastertransformer/models/llama/LLaMA.cc | 207 +++++++---------- src/fastertransformer/models/llama/LLaMA.h | 67 +++--- .../models/llama/LLaMAContextDecoder.cc | 47 ++-- src/fastertransformer/th_op/llama/LLaMA.cc | 106 +++------ src/fastertransformer/th_op/llama/LLaMA.h | 216 ++++++------------ src/fastertransformer/utils/memory_utils.cu | 2 +- 10 files changed, 295 insertions(+), 497 deletions(-) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 931b24e5d..3df66269f 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -7,6 +7,7 @@ pipeline_para_size=4 [request] request_batch_size=32 +start_pos=2 [llama_33B] head_num=52 @@ -15,5 +16,6 @@ vocab_size=32000 decoder_layers=60 rotary_embedding=128 multiple_of=256 -max_cache_seq_len=1024 +max_seq_len=1024 padding_id=0 +random_seed=0 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 2359cf022..3065d4873 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -81,14 +81,15 @@ void llama_example(const INIReader reader) const size_t decoder_layers = reader.GetInteger(model_name, "decoder_layers"); const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); const int multiple_of = reader.GetInteger(model_name, "multiple_of"); - const size_t max_cache_seq_len = reader.GetInteger(model_name, "max_cache_seq_len"); + const size_t max_seq_len = reader.GetInteger(model_name, "max_seq_len"); const size_t hidden_units = head_num * size_per_head; const size_t inter_size = multiple_of * (((8 * hidden_units / 3) + multiple_of - 1) / multiple_of); const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); - const int min_length = reader.GetInteger("request", "min_length", 0); const int padding_id = reader.GetInteger(model_name, "padding_id"); + int start_pos = reader.GetInteger("request", "start_pos", 0); + unsigned long long random_seed = reader.GetInteger("request", "random_seed", 0); FT_CHECK(decoder_layers % pipeline_para_size == 0); @@ -181,10 +182,7 @@ void llama_example(const INIReader reader) model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; llama_weights.loadModel(model_dir); - unsigned long long random_seed; - if (rank == 0) { - random_seed = (unsigned long long)(0); - } + if (world_size > 1) { mpi::bcast(&random_seed, 1, mpi::MPI_TYPE_UNSIGNED_LONG_LONG, 0, mpi::COMM_WORLD); } @@ -193,7 +191,7 @@ void llama_example(const INIReader reader) getAttentionType(size_per_head, getSMVersion(), !((std::getenv("SHONG_PADDING") != nullptr) - && (std::string(std::getenv("SHONG_PADDING")) == "ON")), //true, // remove_padding + && (std::string(std::getenv("SHONG_PADDING")) == "ON")), // true, // remove_padding 0, // llama supports any-seq-length fmha true, // is_fuse false, // with_relative_position_bias @@ -221,6 +219,7 @@ void llama_example(const INIReader reader) vocab_size, rotary_embedding_dim, random_seed, + max_seq_len, tensor_para, pipeline_para, stream, @@ -230,23 +229,18 @@ void llama_example(const INIReader reader) &prop, attention_type); - T* d_output_logits; + float* d_output_logits; deviceMalloc(&d_output_logits, request_batch_size * total_output_len * vocab_size, false); - std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, (size_t)max_input_len}, 
d_input_ids}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size}, d_input_lengths}}, - {"output_seq_len", - Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}, - {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{1}, &min_length}}, - {"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}, - {"max_cache_seq_len", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &max_cache_seq_len}}}; + {"start_pos", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &start_pos}}}; std::unordered_map output_tensors = std::unordered_map{ {"output_logits", Tensor{MEMORY_GPU, - TYPE_FP16, + TYPE_FP32, std::vector{request_batch_size, (size_t)total_output_len, vocab_size}, d_output_logits}}}; @@ -269,12 +263,14 @@ void llama_example(const INIReader reader) POP_RANGE; ft_nvtx::resetScope(); + /* if (rank == world_size - 1) { - T* out = (T*)malloc(sizeof(T) * request_batch_size * total_output_len * vocab_size); + float* out = (float*)malloc(sizeof(float) * request_batch_size * total_output_len * vocab_size); cudaMemcpy(out, d_output_logits, - sizeof(T) * request_batch_size * total_output_len * vocab_size, - cudaMemcpyDeviceToHost); + sizeof(float) * request_batch_size * total_output_len * vocab_size, + cudaMemcpyDeviceToHost + ); for (int b = 0; b < request_batch_size; ++b) { std::cout << "["; for (int s = 0; s < total_output_len; ++s) { @@ -289,6 +285,7 @@ void llama_example(const INIReader reader) std::cout << "\n"; free(out); } + */ // test time cudaProfilerStart(); diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index f22fa3032..f0dfce8c7 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -43,11 +43,12 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); FT_CHECK(output_tensors->at("key_cache").shape.size() == 5); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); - const int request_batch_size = input_tensors->at("attention_mask").shape[0]; - const int request_seq_len = input_tensors->at("attention_mask").shape[2]; - const int layer_id = input_tensors->getVal("layer_id"); - const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); - int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + const int batch_size = input_tensors->at("attention_mask").shape[0]; + const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int layer_id = input_tensors->getVal("layer_id"); + const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); + int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + size_t start_pos = input_tensors->at("start_pos").max(); T* attention_out = output_tensors->at("hidden_features").getPtr(); T* attention_input = input_tensors->at("input_query").getPtr(); @@ -58,11 +59,11 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten "LLaMA Context FUSED_PADDED_MHA is not supported !"); PUSH_RANGE("attention buffer alloc"); - allocateBuffer(request_batch_size, request_seq_len, attention_type != AttentionType::FUSED_MHA); + allocateBuffer(batch_size, seq_len, attention_type != AttentionType::FUSED_MHA); POP_RANGE; sync_check_cuda_error(); - const int m = input_tensors->at("input_query").shape[0]; + const int m = 
input_tensors->at("input_query").shape[0]; PUSH_RANGE("qkv_gemm"); @@ -83,7 +84,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync(q_buf_2_, 0, request_batch_size * request_seq_len * 3 * hidden_units_ * sizeof(T), stream_); + cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); } invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, @@ -92,8 +93,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, attention_weights->query_weight.bias, padding_offset, - request_batch_size, - request_seq_len, + batch_size, + seq_len, m, head_num_, size_per_head_, @@ -108,12 +109,17 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // Use batch major // put k/v_buf from shape [B, H, L, Dh] // to cache [B, H, Dh/x, L, x] and [B, H, L, Dh/x, x] + // TODO: Cache implementation + // k_cache: [batch_size, num_heads, L, Dh] + // k_buf: [batch_size, num_heads, start_pos + seq_len, Dh] + // v_buf: [batch_size, num_heads, L, Dh] + invokeTranspose4dBatchMajor(output_tensors->getPtr("key_cache"), output_tensors->getPtr("value_cache"), k_buf_2_, v_buf_2_, - request_batch_size, - request_seq_len, + batch_size, + seq_len, max_seq_len, size_per_head_, head_num_, @@ -122,19 +128,16 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // k_cache = (batch_size, num_heads, Dh/x, L, x) // v_cache = (batch_size, num_heads, L, Dh) sync_check_cuda_error(); - - // NOTE: qkv buffer shape (batch_size, num_heads,L or prompt_len + L, Dh) - POP_RANGE; if (attention_type == AttentionType::FUSED_MHA) { - dispatcher_fp16->setup_causal_masked_fmha(request_seq_len, request_batch_size); + dispatcher_fp16->setup_causal_masked_fmha(seq_len, batch_size); dispatcher_fp16->run_causal_masked_fmha(qkv_buf_, cu_seqlens, qkv_buf_3_, true, stream_); } else { const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = request_seq_len; // q length - const int attention_seq_len_2 = request_seq_len; // kv length + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = start_pos + seq_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); // @@ -161,7 +164,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten CUDA_R_32F, attention_seq_len_2, // n attention_seq_len_2 * attention_seq_len_1, - request_batch_size * head_num_, // global batch size + batch_size * head_num_, // global batch size CUDA_R_32F); sync_check_cuda_error(); @@ -172,7 +175,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = request_batch_size; + param.batch_size = batch_size; param.q_length = attention_seq_len_1; param.k_length = attention_seq_len_2; param.num_heads = head_num_; @@ -198,7 +201,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qk_buf_, attention_seq_len_2, attention_seq_len_2 * attention_seq_len_1, - request_batch_size * head_num_); + batch_size * head_num_); POP_RANGE; PUSH_RANGE("softmax"); @@ -206,7 +209,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) param.qk = qk_buf_; // (batch_size, 
head_num, q_length, k_length) param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = request_batch_size; + param.batch_size = batch_size; param.q_length = attention_seq_len_1; param.k_length = attention_seq_len_2; param.num_heads = head_num_; @@ -232,13 +235,13 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_2_, size_per_head_, attention_seq_len_1 * size_per_head_, - request_batch_size * head_num_); + batch_size * head_num_); // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) if (padding_offset == nullptr) { invokeTransposeQKV(qkv_buf_3_, qkv_buf_2_, - request_batch_size, + batch_size, attention_seq_len_1, head_num_, size_per_head_, @@ -251,7 +254,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, qkv_buf_3_, m, - request_batch_size, + batch_size, attention_seq_len_1, head_num_, size_per_head_, @@ -286,8 +289,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( size_t head_num, size_t size_per_head, cudaStream_t stream, @@ -296,8 +298,6 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b bool is_free_buffer_after_forward, bool is_qk_buf_float): BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), - max_batch_size_(max_batch_size), - max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), @@ -307,8 +307,7 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b } template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( size_t head_num, size_t size_per_head, size_t local_head_num, @@ -318,8 +317,6 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b bool is_free_buffer_after_forward, bool is_qk_buf_float): BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), - max_batch_size_(max_batch_size), - max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), @@ -331,8 +328,7 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b } template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( size_t head_num, size_t size_per_head, size_t local_head_num, @@ -343,8 +339,6 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t max_b bool is_free_buffer_after_forward, bool is_qk_buf_float): BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), - max_batch_size_(max_batch_size), - max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), @@ -361,8 +355,6 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(LLaMAContextAttentionL attention_layer.cublas_wrapper_, attention_layer.allocator_, attention_layer.is_free_buffer_after_forward_), - max_batch_size_(attention_layer.max_batch_size_), - max_seq_len_(attention_layer.max_seq_len_), head_num_(attention_layer.head_num_), size_per_head_(attention_layer.size_per_head_), hidden_units_(attention_layer.hidden_units_), diff --git 
a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index 635d3d15a..85fd74af8 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -27,15 +27,11 @@ namespace fastertransformer { template class LLaMAContextAttentionLayer: public BaseAttentionLayer { private: - // buffer handling - size_t max_batch_size_ = 0; - size_t max_seq_len_ = 0; - // metadata - const size_t head_num_; - const size_t size_per_head_; - const size_t hidden_units_; - const size_t rotary_embedding_dim_; + const size_t head_num_; + const size_t size_per_head_; + const size_t hidden_units_; + const size_t rotary_embedding_dim_; // fmha runner int sm_ = getSMVersion(); @@ -54,19 +50,17 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { protected: using BaseAttentionLayer::allocator_; using BaseAttentionLayer::stream_; - T* qkv_buf_ = nullptr; - T* q_buf_2_ = nullptr; - T* k_buf_2_ = nullptr; - T* v_buf_2_ = nullptr; - T* qk_buf_ = nullptr; - float* qk_buf_float_ = nullptr; - T* qkv_buf_2_ = nullptr; - T* qkv_buf_3_ = nullptr; + T* qkv_buf_ = nullptr; + T* q_buf_2_ = nullptr; + T* k_buf_2_ = nullptr; + T* v_buf_2_ = nullptr; + T* qk_buf_ = nullptr; + float* qk_buf_float_ = nullptr; + T* qkv_buf_2_ = nullptr; + T* qkv_buf_3_ = nullptr; public: - LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, - size_t head_num, + LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -74,9 +68,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { bool is_free_buffer_after_forward, bool is_qk_buf_float); - LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, - size_t head_num, + LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, size_t local_head_num, cudaStream_t stream, @@ -85,9 +77,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { bool is_free_buffer_after_forward, bool is_qk_buf_float); - LLaMAContextAttentionLayer(size_t max_batch_size, - size_t max_seq_len, - size_t head_num, + LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, size_t local_head_num, size_t rotary_embedding_dim, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 7f9faf463..e4c4e4ee8 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -19,7 +19,9 @@ #include "src/fastertransformer/kernels/decoding_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" #include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" +#include "src/fastertransformer/utils/memory_utils.h" #include +#include namespace fastertransformer { @@ -48,30 +50,30 @@ void LLaMA::allocateBuffer() } template -void LLaMA::allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) +void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - const size_t self_cache_size = - (num_layer_ / pipeline_para_.world_size_) * batch_size * max_cache_seq_len * hidden_units_; + const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batch_size * max_seq_len * hidden_units_; - input_attention_mask_ = (T*)(allocator_->reMalloc( - input_attention_mask_, sizeof(T) 
* batch_size * max_seq_len * max_cache_seq_len, false)); + input_attention_mask_ = + (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * max_seq_len, false)); decoder_output_buf_ = (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); - // logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * max_seq_len * - // vocab_size_, false)); + normed_decoder_output_buf_ = + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); + logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); value_cache_ = key_cache_ + self_cache_size; tiled_input_ids_buf_ = - (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batch_size * max_input_len, false)); + (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batch_size * seq_len, false)); tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batch_size, false)); - context_decoder_input_buf_ = (T*)(allocator_->reMalloc( - context_decoder_input_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); + context_decoder_input_buf_ = + (T*)(allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); context_decoder_output_buf_ = (T*)(allocator_->reMalloc( - context_decoder_output_buf_, sizeof(T) * batch_size * max_input_len * hidden_units_, false)); + context_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); is_allocate_buffer_ = true; } @@ -82,7 +84,7 @@ void LLaMA::freeBuffer() if (is_allocate_buffer_) { allocator_->free((void**)(&input_attention_mask_)); allocator_->free((void**)(&decoder_output_buf_)); - // allocator_->free((void**)(&logits_buf_)); + allocator_->free((void**)(&logits_buf_)); allocator_->free((void**)(&key_cache_)); if (cache_indirections_[0] != nullptr) { @@ -100,19 +102,20 @@ void LLaMA::freeBuffer() } template -LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long random_seed, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type): +LLaMA::LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + size_t random_seed, + size_t max_seq_len, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -120,6 +123,8 @@ LLaMA::LLaMA(size_t head_num, num_layer_(num_layer), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), + random_seed_(random_seed), + max_seq_len_(max_seq_len), hidden_units_(head_num * size_per_head), attention_type_(attention_type) { @@ -129,21 +134,22 @@ LLaMA::LLaMA(size_t head_num, } template -LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long 
random_seed, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type): +LLaMA::LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + size_t random_seed, + size_t max_seq_len, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -151,6 +157,8 @@ LLaMA::LLaMA(size_t head_num, num_layer_(num_layer), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), + random_seed_(random_seed), + max_seq_len_(max_seq_len), hidden_units_(head_num * size_per_head), pipeline_para_(pipeline_para), attention_type_(attention_type) @@ -167,6 +175,8 @@ LLaMA::LLaMA(LLaMA const& llama): num_layer_(llama.num_layer_), vocab_size_(llama.vocab_size_), rotary_embedding_dim_(llama.rotary_embedding_dim_), + random_seed_(llama.random_seed_), + max_seq_len_(llama.max_seq_len_), hidden_units_(llama.hidden_units_), pipeline_para_(llama.pipeline_para_), attention_type_(llama.attention_type_) @@ -195,49 +205,29 @@ void LLaMA::forward(std::unordered_map* output_ten const LLaMAWeight* llama_weights) { // input_tensors: - // input_ids [batch_size, max_input_length] + // input_ids [batch_size, seq_len] // input_lengths [batch_size] - // output_seq_len [batch_size] on cpu - // min_length [1] or [batch_size] on cpu, optional, int - // random_seed [1] or [batch_size] on cpu, optional, unsigned long long int. - // max_cache_seq_len [batch_size] on cpu + // start_pos [1] int on cpu // output_tensors: - // output_logits [batch_size, max_output_seq_len, vocab_size] + // output_logits [batch_size, seq_len, vocab_size] FT_CHECK_WITH_INFO(input_tensors->size() >= 3, "input_tensors->size() >= 3"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); - FT_CHECK(input_tensors->find("output_seq_len") != input_tensors->end() - && input_tensors->at("output_seq_len").shape.size() == 1); const size_t batch_size = input_tensors->at("input_ids").shape[0]; // NOTE: Prefix Prompt PreProcessing // get prefix_prompt_weight for each batch --> shape [batch, 1] // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] - int max_input_length = input_tensors->at("input_ids").shape[1]; + int seq_len = input_tensors->at("input_ids").shape[1]; - // Prefix Soft Prompt - const size_t max_output_seq_len = input_tensors->at("output_seq_len").max(); - const size_t max_seq_len = max_output_seq_len; // max cache seq len should include max prefix prompt length as it has k/v states - const size_t max_cache_seq_len = input_tensors->at("max_cache_seq_len").max(); - if (max_cache_seq_len < max_seq_len) { - FT_LOG_WARNING("max_cache_seq_len (%d) is less than max_seq_len (%d). " - "Note that this reduces the memory cost of k/v cache, but may hurt the accuracy.", - max_cache_seq_len, - max_seq_len); - } - else if (max_cache_seq_len > max_seq_len) { - FT_LOG_WARNING("max_cache_seq_len (%d) is larger than max_seq_len (%d). " - "This may lead to additional memory cost. 
Suggest to use smaller max_cache_seq_len.", - max_cache_seq_len, - max_seq_len); - } + const size_t start_pos = input_tensors->at("start_pos").max(); const cudaDataType_t gemm_data_type = getCudaDataType(); - allocateBuffer(batch_size, max_seq_len, max_cache_seq_len, max_input_length); + allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); const DataType data_type = getTensorType(); @@ -245,10 +235,10 @@ void LLaMA::forward(std::unordered_map* output_ten batch_size, head_num_, size_per_head_ / (16 / sizeof(T)), - max_cache_seq_len, + max_seq_len_, 16 / sizeof(T)}; const std::vector self_v_cache_shape = { - num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_cache_seq_len, size_per_head_}; + num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}; invokeTileGptInputs(tiled_input_ids_buf_, tiled_input_lengths_buf_, @@ -256,12 +246,12 @@ void LLaMA::forward(std::unordered_map* output_ten input_tensors->at("input_lengths").getPtr(), batch_size, 1, - max_input_length, + seq_len, stream_); sync_check_cuda_error(); invokeBuildDecoderAttentionMask( - input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, max_input_length, 0, stream_); + input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, seq_len, 0, stream_); sync_check_cuda_error(); if (pipeline_para_.rank_ == 0) { @@ -272,8 +262,8 @@ void LLaMA::forward(std::unordered_map* output_ten pPromptTuningParam{}, // no p/prompt tuning tiled_input_ids_buf_, 1, - max_input_length, - max_input_length, + seq_len, + seq_len, // must be same batch_size, hidden_units_, stream_); @@ -282,94 +272,51 @@ void LLaMA::forward(std::unordered_map* output_ten std::unordered_map decoder_input_tensors{ {"decoder_input", - Tensor{ - MEMORY_GPU, data_type, {batch_size, (size_t)max_input_length, hidden_units_}, context_decoder_input_buf_}}, + Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_input_buf_}}, {"attention_mask", - Tensor{MEMORY_GPU, - data_type, - {batch_size, 1, (size_t)max_input_length, (size_t)(max_input_length)}, - input_attention_mask_}}, - {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, tiled_input_lengths_buf_}}}; + Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(seq_len)}, input_attention_mask_}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, tiled_input_lengths_buf_}}, + {"start_pos", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &start_pos}}}; std::unordered_map decoder_output_tensors{ {"decoder_output", - Tensor{MEMORY_GPU, - data_type, - {batch_size, (size_t)max_input_length, hidden_units_}, - context_decoder_output_buf_}}, + Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_output_buf_}}, {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, - {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}, - {"last_token_hidden_units", Tensor{MEMORY_GPU, data_type, {batch_size, hidden_units_}, decoder_output_buf_}}}; + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}}; llama_context_decoder_->forward( &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { - invokeGeneralLLaMALayerNorm(context_decoder_input_buf_, + invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, context_decoder_output_buf_, 
llama_weights->post_decoder_layernorm.gamma, llama_weights->post_decoder_layernorm.beta, layernorm_eps_, - batch_size * max_input_length, + batch_size * seq_len, hidden_units_, stream_); sync_check_cuda_error(); - - // FIXME: debugging - T* output_logits = output_tensors->at("output_logits").getPtr(); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, vocab_size_, - batch_size * max_input_length, + batch_size * seq_len, hidden_units_, llama_weights->post_decoder_embedding.kernel, vocab_size_, - context_decoder_input_buf_, + normed_decoder_output_buf_, hidden_units_, // n - output_logits, - // logits_buf_, + logits_buf_, vocab_size_); sync_check_cuda_error(); - } - // sendTensorsToFirstPipelineNode(output_tensors, input_tensors); -} - -template -void LLaMA::sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors) -{ - NcclParam tensor_para(0, 1); - - FT_LOG_DEBUG(__PRETTY_FUNCTION__); - if (pipeline_para_.world_size_ == 1) { - // throw errors when detected - ftNcclStreamSynchronize(tensor_para, pipeline_para_, stream_); - return; - } - const auto pp_rank = pipeline_para_.rank_; - - ftNcclGroupStart(); - for (auto const& it : *output_tensors) { - if (it.second.data == nullptr) { - continue; - } - - if (pp_rank == pipeline_para_.world_size_ - 1) { - ftNcclSend(it.second.getPtr(), it.second.sizeBytes(), 0, pipeline_para_, stream_); - } - else if (pp_rank == 0) { - ftNcclRecv(it.second.getPtr(), - it.second.sizeBytes(), - pipeline_para_.world_size_ - 1, - pipeline_para_, - stream_); + if (std::is_same::value) { + float* output_logits = output_tensors->at("output_logits").getPtr(); + invokeCudaCast(output_logits, logits_buf_, batch_size * seq_len * vocab_size_, stream_); + sync_check_cuda_error(); } } - ftNcclGroupEnd(); - // throw errors when detected - ftNcclStreamSynchronize(tensor_para, pipeline_para_, stream_); } template diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 26d1a6696..dab7a0509 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -35,6 +35,8 @@ class LLaMA: public BaseLayer { size_t num_layer_; size_t vocab_size_; size_t rotary_embedding_dim_; + size_t random_seed_; + size_t max_seq_len_; static constexpr float layernorm_eps_ = 1e-6f; @@ -51,7 +53,7 @@ class LLaMA: public BaseLayer { LLaMAContextDecoder* llama_context_decoder_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); + void allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len); void freeBuffer() override; void initialize(); @@ -59,8 +61,9 @@ class LLaMA: public BaseLayer { protected: T* input_attention_mask_; T* decoder_output_buf_; + T* normed_decoder_output_buf_; - float* logits_buf_; + T* logits_buf_; T* key_cache_; T* value_cache_; @@ -76,35 +79,37 @@ class LLaMA: public BaseLayer { const std::unordered_map* input_tensors); public: - LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - unsigned long long random_seed, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA); - - LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, 
- size_t rotary_embedding_dim, - unsigned long long random_seed, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA); + LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + size_t random_seed, + size_t max_seq_len, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA); + + LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + size_t random_seed, + size_t max_seq_len, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA); LLaMA(LLaMA const& LLaMA); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 08980923a..66fc30b6b 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -26,9 +26,7 @@ namespace fastertransformer { template void LLaMAContextDecoder::initialize() { - self_attention_layer_ = new LLaMAContextAttentionLayer(0, // max_batch_size - 0, // max_seq_len - head_num_, + self_attention_layer_ = new LLaMAContextAttentionLayer(head_num_, size_per_head_, head_num_, rotary_embedding_dim_, @@ -182,11 +180,11 @@ void LLaMAContextDecoder::forward(std::vector* { std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, {"attention_mask", input_tensors->at(1)}, - {"input_lengths", input_tensors->at(2)}}; + {"input_lengths", input_tensors->at(2)}, + {"start_pos", input_tensors->at(3)}}; std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, {"key_cache", output_tensors->at(1)}, - {"value_cache", output_tensors->at(2)}, - {"last_token_hidden_units", output_tensors->at(3)}}; + {"value_cache", output_tensors->at(2)}}; forward(&output_tensors_map, &input_tensors_map, llama_decoder_layer_weight); } @@ -198,27 +196,26 @@ void LLaMAContextDecoder::forward(std::unordered_map* { // input tensors: // decoder_input [batch_size, seq_len, hidden_dimension], - // attention_mask [batch_size, 1, seq_len, seq_len + max_prompt_length] + // attention_mask [batch_size, 1, seq_len, seq_len] // input_lengths [batch_size] + // start_pos [1] // output tensors: // decoder_output [batch_size, seq_len, hidden_dimension], - // key_cache [num_layer, batch, local_head_num, size_per_head // x, max_seq_len, x] - // value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] - // last_token_hidden_units [batch_size, hidden_dimension] + // key_cache [num_layer, batch, max_seq_len, local_head_num, size_per_head] + // value_cache [num_layer, batch, max_seq_len, local_head_num, size_per_head] // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * batch_size'. // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. 
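As a reference for the reworked decoder contract described in the shape comments above, here is a minimal caller-side sketch. It is not taken from the patch: the helper name `run_context_pass`, the `d_*` pointer parameters, the exact spelling `LLaMADecoderLayerWeight`, and the use of `TYPE_INT32`/`int` for `start_pos` are assumptions; the tensor-map pattern itself mirrors the calls shown in this diff.

```
// Caller-side sketch of the new LLaMAContextDecoder::forward contract.
// Assumes the fastertransformer headers/namespace are in scope; every d_*
// argument is a pre-filled device buffer with the shape noted beside it.
template<typename T>
void run_context_pass(LLaMAContextDecoder<T>*                         decoder,
                      const std::vector<LLaMADecoderLayerWeight<T>*>* layer_weights,
                      T*     d_decoder_input,   // [batch_size, seq_len, hidden_units]
                      T*     d_attention_mask,  // [batch_size, 1, seq_len, seq_len]
                      int*   d_input_lengths,   // [batch_size]
                      T*     d_decoder_output,  // [batch_size, seq_len, hidden_units]
                      T*     d_key_cache,       // [num_layer, batch, max_seq_len, head_num, size_per_head]
                      T*     d_value_cache,     // same shape as the key cache
                      size_t batch_size, size_t seq_len, size_t hidden_units,
                      const std::vector<size_t>& kv_cache_shape)
{
    const DataType data_type = getTensorType<T>();
    int            start_pos = 0;  // 0 on the prompt pass; number of cached tokens on later calls

    std::unordered_map<std::string, Tensor> inputs{
        {"decoder_input", Tensor{MEMORY_GPU, data_type, {batch_size, seq_len, hidden_units}, d_decoder_input}},
        {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, seq_len, seq_len}, d_attention_mask}},
        {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, d_input_lengths}},
        {"start_pos", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &start_pos}}};

    std::unordered_map<std::string, Tensor> outputs{
        {"decoder_output", Tensor{MEMORY_GPU, data_type, {batch_size, seq_len, hidden_units}, d_decoder_output}},
        {"key_cache", Tensor{MEMORY_GPU, data_type, kv_cache_shape, d_key_cache}},
        {"value_cache", Tensor{MEMORY_GPU, data_type, kv_cache_shape, d_value_cache}}};

    decoder->forward(&outputs, &inputs, layer_weights);
}
```

The `start_pos` input marks how many tokens are already held in the `max_seq_len`-sized key/value cache, so later incremental calls can pass only the new tokens as `decoder_input` while reusing the same tensor maps.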
- FT_CHECK(input_tensors->size() == 3); - FT_CHECK(output_tensors->size() == 4); + FT_CHECK(input_tensors->size() == 4); + FT_CHECK(output_tensors->size() == 3); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; - const int max_prompt_length = - input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; - const DataType data_type = getTensorType(); + const int batch_size = input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; + const int start_pos = input_tensors->at("start_pos").max(); + const DataType data_type = getTensorType(); allocateBuffer(batch_size, seq_len); T* decoder_input = input_tensors->at("decoder_input").getPtr(); @@ -243,15 +240,16 @@ void LLaMAContextDecoder::forward(std::unordered_map* size_t h_token_num = batch_size * seq_len; if (is_unpadded_mha) { - const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, &h_token_num, padding_offset_, cu_seqlens_, - base_input_lengths, + input_lengths, batch_size, seq_len, stream_); + sync_check_cuda_error(); } for (int l = 0; l < num_layer_; l++) { @@ -262,9 +260,10 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (l == 0 && is_unpadded_mha) { invokeRemovePadding( decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); + sync_check_cuda_error(); } - const bool is_final = false; // TODO(bhsueh) remove this flag + const bool is_final = false; T* layer_input = decoder_layer_output_; T* layer_output = decoder_layer_output_; if (!is_unpadded_mha) { @@ -279,6 +278,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); + sync_check_cuda_error(); } invokeGeneralLLaMALayerNorm(decoder_normed_input_, @@ -296,10 +296,11 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"attention_mask", Tensor{MEMORY_GPU, data_type, - {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, + {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len)}, attention_mask}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, - {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; + {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, + {"start_pos", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &start_pos}}}; if (is_unpadded_mha) { self_attention_input_tensors.insert("padding_offset", @@ -355,6 +356,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); + sync_check_cuda_error(); } if ((l == num_layer_ - 1) && is_unpadded_mha) { @@ -364,6 +366,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* h_token_num, head_num_ * size_per_head_, stream_); + sync_check_cuda_error(); } } diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index 08449b679..7be46c7ed 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ 
-17,22 +17,20 @@ #include "src/fastertransformer/th_op/llama/LLaMA.h" namespace th = torch; -namespace ft = fastertransformer; namespace torch_ext { -LLaMA::LLaMA(const int64_t head_num, - const int64_t size_per_head, - const int64_t inter_size, - const int64_t layer_num, - const int64_t vocab_size, - const int64_t rotary_embedding_dim, - const int64_t start_id, - const int64_t end_id, - const int64_t tensor_para_size, - const int64_t pipeline_para_size, - const int64_t max_seq_len, - const bool use_gptj_residual, - const vector weights): +LLaMA::LLaMA(const int64_t num_heads, + const int64_t size_per_head, + const int64_t inter_size, + const int64_t num_layers, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t random_seed, + const int64_t max_seq_len, + const int64_t tensor_para_size, + const int64_t pipeline_para_size, + const vector weights): + vocab_size_(vocab_size), st_(weights[0].scalar_type()) { for (auto t : weights) { @@ -41,33 +39,29 @@ LLaMA::LLaMA(const int64_t head_num, switch (st_) { case at::ScalarType::Float: - ftllama = new FTLLaMA((size_t)head_num, + ftllama = new FTLLaMA((size_t)num_heads, (size_t)size_per_head, (size_t)inter_size, - (size_t)layer_num, + (size_t)num_layers, (size_t)vocab_size, (size_t)rotary_embedding_dim, - start_id, - end_id, + (size_t)random_seed, + (size_t)max_seq_len, tensor_para_size, pipeline_para_size, - (size_t)max_seq_len, - use_gptj_residual, weights); break; case at::ScalarType::Half: - ftllama = new FTLLaMA((size_t)head_num, + ftllama = new FTLLaMA((size_t)num_heads, (size_t)size_per_head, (size_t)inter_size, - (size_t)layer_num, + (size_t)num_layers, (size_t)vocab_size, (size_t)rotary_embedding_dim, - start_id, - end_id, + (size_t)random_seed, + (size_t)max_seq_len, tensor_para_size, pipeline_para_size, - (size_t)max_seq_len, - use_gptj_residual, weights); break; default: @@ -80,18 +74,8 @@ LLaMA::~LLaMA() delete ftllama; } -std::vector LLaMA::forward(th::Tensor input_ids, - th::Tensor input_lengths, - const int64_t output_len, - th::optional beam_width_opt, - th::optional top_k_opt, - th::optional top_p_opt, - th::optional beam_search_diversity_rate_opt, - th::optional temperature_opt, - th::optional len_penalty_opt, - th::optional repetition_penalty_opt, - th::optional random_seed_opt, - th::optional return_cum_log_probs_opt) +th::Tensor +LLaMA::forward(th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t start_pos) { CHECK_TH_CUDA(input_ids); CHECK_CONTIGUOUS(input_ids); @@ -99,45 +83,13 @@ std::vector LLaMA::forward(th::Tensor input_ids, CHECK_TH_CUDA(input_lengths); CHECK_CONTIGUOUS(input_lengths); TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); - int64_t return_cum_log_probs = return_cum_log_probs_opt.has_value() ? (int64_t)return_cum_log_probs_opt.value() : 0; - if (return_cum_log_probs_opt.has_value()) { - TORCH_CHECK(return_cum_log_probs == 0 || return_cum_log_probs == 1, - "return_cum_log_probs should be" - " 0 (no return cum_log_probs), " - " 1 (the cumulative log probs of generated sequences)") - } - const int beam_width = beam_width_opt.has_value() ? 
(int)beam_width_opt.value() : 1; - - const int batch_size = input_ids.size(0); - const int max_input_length = input_ids.size(1); - const int total_request_output_len = max_input_length + output_len; - th::Tensor output_ids = torch::empty({batch_size, beam_width, total_request_output_len}, - torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); - th::Tensor sequence_lengths = - torch::empty({batch_size, beam_width}, torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); - th::Tensor cum_log_probs = - torch::empty({batch_size, beam_width}, torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false)); - - ftllama->forward(input_ids, - input_lengths, - output_ids, - sequence_lengths, - cum_log_probs, - (const size_t)output_len, - (const size_t)beam_width, - top_k_opt, - top_p_opt, - beam_search_diversity_rate_opt, - temperature_opt, - len_penalty_opt, - repetition_penalty_opt, - random_seed_opt, - return_cum_log_probs_opt); - if (return_cum_log_probs > 0) { - return std::vector{output_ids, sequence_lengths, cum_log_probs}; - } - return std::vector{output_ids, sequence_lengths}; + const int batch_size = input_ids.size(0); + const int seq_len = input_ids.size(1); + th::Tensor output_logits = torch::empty({batch_size, seq_len, (long)vocab_size_}, + torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false)); + ftllama->forward(output_logits, input_ids, input_lengths, start_pos); + return output_logits; } } // namespace torch_ext @@ -158,7 +110,5 @@ static auto fasterTransformerGptTHS = int64_t, int64_t, int64_t, - int64_t, - bool, std::vector>()) .def("forward", &torch_ext::LLaMA::forward); diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 9a5efa3d0..0d97dc322 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -29,51 +29,35 @@ using std::vector; class IFLLaMA { public: virtual ~IFLLaMA() {} - virtual void forward(th::Tensor& input_ids, - th::Tensor& input_lengths, - th::Tensor& output_ids, - th::Tensor& sequence_lengths, - th::Tensor& cum_log_probs, - const size_t request_output_len, - const size_t beam_width, - th::optional top_k_opt, - th::optional top_p_opt, - th::optional beam_search_diversity_rate_opt, - th::optional temperature_opt, - th::optional len_penalty_opt, - th::optional repetition_penalty_opt, - th::optional random_seed_opt, - th::optional return_cum_log_probs_opt) = 0; + virtual void + forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t start_pos) = 0; }; template class FTLLaMA: public IFLLaMA { public: - FTLLaMA(const size_t head_num, + FTLLaMA(const size_t num_heads, const size_t size_per_head, const size_t inter_size, - const size_t layer_num, + const size_t num_layers, const size_t vocab_size, const size_t rotary_embedding_dim, - const int start_id, - const int end_id, + const size_t random_seed, + const size_t max_seq_len, const int64_t tensor_para_size, const int64_t pipeline_para_size, - const size_t max_seq_len, - const bool use_gptj_residual, const vector weights): - head_num_(head_num), + num_heads_(num_heads), size_per_head_(size_per_head), inter_size_(inter_size), - layer_num_(layer_num), + num_layers_(num_layers), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), - start_id_(start_id), - end_id_(end_id), - use_gptj_residual_(use_gptj_residual), - weights_(weights), + random_seed_(random_seed), + max_seq_len_(max_seq_len), 
tensor_para_size_(tensor_para_size), - pipeline_para_size_(pipeline_para_size) + pipeline_para_size_(pipeline_para_size), + weights_(weights) { ft::check_cuda_error(cublasLtCreate(&cublasltHandle_)); cublas_algo_map_ = new ft::cublasAlgoMap(GEMM_CONFIG, ""); @@ -81,40 +65,42 @@ class FTLLaMA: public IFLLaMA { ftNcclInitialize(tensor_para_, pipeline_para_, tensor_para_size, pipeline_para_size); - llama_weights_.resizeLayer(layer_num_); - for (int i = 0; i < (int)layer_num_; i++) { + llama_weights_.resizeLayer(num_layers_); + for (int i = 0; i < (int)num_layers_; i++) { llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta = - get_ptr(weights_[i + 0 * layer_num_]); + get_ptr(weights_[i + 0 * num_layers_]); llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.gamma = - get_ptr(weights_[i + 1 * layer_num_]); + get_ptr(weights_[i + 1 * num_layers_]); llama_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.kernel = - get_ptr(weights_[i + 2 * layer_num_]); + get_ptr(weights_[i + 2 * num_layers_]); llama_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.bias = - get_ptr(weights_[i + 3 * layer_num_]); + get_ptr(weights_[i + 3 * num_layers_]); llama_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.kernel = - get_ptr(weights_[i + 4 * layer_num_]); + get_ptr(weights_[i + 4 * num_layers_]); llama_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.bias = - get_ptr(weights_[i + 5 * layer_num_]); + get_ptr(weights_[i + 5 * num_layers_]); llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.kernel = - get_ptr(weights_[i + 6 * layer_num_]); + get_ptr(weights_[i + 6 * num_layers_]); llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.bias = - get_ptr(weights_[i + 7 * layer_num_]); + get_ptr(weights_[i + 7 * num_layers_]); llama_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.kernel = - get_ptr(weights_[i + 8 * layer_num_]); + get_ptr(weights_[i + 8 * num_layers_]); llama_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.bias = - get_ptr(weights_[i + 9 * layer_num_]); + get_ptr(weights_[i + 9 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight2.kernel = + get_ptr(weights_[i + 10 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight2.bias = + get_ptr(weights_[i + 11 * num_layers_]); llama_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.beta = - get_ptr(weights_[i + 10 * layer_num_]); + get_ptr(weights_[i + 12 * num_layers_]); llama_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.gamma = - get_ptr(weights_[i + 11 * layer_num_]); + get_ptr(weights_[i + 13 * num_layers_]); } - llama_weights_.pre_decoder_embedding_table = get_ptr(weights_[12 * layer_num_ + 0]); - llama_weights_.post_decoder_layernorm.gamma = get_ptr(weights_[12 * layer_num_ + 1]); - llama_weights_.post_decoder_layernorm.beta = get_ptr(weights_[12 * layer_num_ + 2]); - llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[12 * layer_num_ + 3]); - - //llama_weights_.setMaxSeqLen(max_seq_len); + llama_weights_.pre_decoder_embedding_table = get_ptr(weights_[14 * num_layers_ + 0]); + llama_weights_.post_decoder_layernorm.beta = get_ptr(weights_[14 * num_layers_ + 1]); + llama_weights_.post_decoder_layernorm.gamma = get_ptr(weights_[14 * num_layers_ + 2]); + llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * 
num_layers_ + 3]); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); } @@ -128,24 +114,11 @@ class FTLLaMA: public IFLLaMA { delete cublas_wrapper_mutex_; } - void forward(th::Tensor& input_ids, - th::Tensor& input_lengths, - th::Tensor& output_ids, - th::Tensor& sequence_lengths, - th::Tensor& cum_log_probs, - const size_t request_output_len, - const size_t beam_width, - th::optional top_k_opt, - th::optional top_p_opt, - th::optional beam_search_diversity_rate_opt, - th::optional temperature_opt, - th::optional len_penalty_opt, - th::optional repetition_penalty_opt, - th::optional random_seed_opt, - th::optional return_cum_log_probs_opt) override + virtual void forward(th::Tensor& output_logits, + th::Tensor& input_ids, + th::Tensor& input_lengths, + const int64_t start_pos) override { - int return_cum_log_probs = return_cum_log_probs_opt.has_value() ? (int)return_cum_log_probs_opt.value() : 0; - auto stream = at::cuda::getCurrentCUDAStream().stream(); cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); cublasSetStream(cublasHandle, stream); @@ -161,8 +134,7 @@ class FTLLaMA: public IFLLaMA { } const size_t request_batch_size = (size_t)input_ids.size(0); - const size_t max_input_length = (size_t)input_ids.size(1); - const int total_output_len = (int)(max_input_length + request_output_len); + const size_t seq_len = (size_t)input_ids.size(1); ft::AttentionType attention_type = ft::getAttentionType(size_per_head_, ft::getSMVersion(), @@ -172,85 +144,41 @@ class FTLLaMA: public IFLLaMA { false, // with_relative_position_bias true); // causal_mask - ft::LLaMA llama = ft::LLaMA(head_num_, + ft::LLaMA llama = ft::LLaMA(num_heads_, size_per_head_, inter_size_, - layer_num_, + num_layers_, vocab_size_, rotary_embedding_dim_, - 0, // random_seed, + random_seed_, + max_seq_len_, + tensor_para_, + pipeline_para_, stream, &cublas_wrapper, &allocator, false, // is_free_buffer_after_forward &prop_, // cuda_device_prop attention_type // attention_type - ); - std::vector output_seq_len(request_batch_size, total_output_len); + ); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, - std::vector{request_batch_size, max_input_length}, + std::vector{request_batch_size, seq_len}, get_ptr(input_ids)}}, {"input_lengths", ft::Tensor{ ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{request_batch_size}, get_ptr(input_lengths)}}, - {"output_seq_len", - ft::Tensor{ - ft::MEMORY_CPU, ft::TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}}; - if (beam_width > 1 && beam_search_diversity_rate_opt.has_value()) { - input_tensors.insert( - {"beam_search_diversity_rate", - convert_tensor(beam_search_diversity_rate_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } - if (top_p_opt.has_value()) { - input_tensors.insert( - {"runtime_top_p", convert_tensor(top_p_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } - if (top_k_opt.has_value()) { - input_tensors.insert( - {"runtime_top_k", convert_tensor(top_k_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } - if (temperature_opt.has_value()) { - input_tensors.insert( - {"temperature", convert_tensor(temperature_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } - if (len_penalty_opt.has_value()) { - input_tensors.insert( - {"len_penalty", convert_tensor(len_penalty_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } - if (repetition_penalty_opt.has_value()) { - input_tensors.insert({"repetition_penalty", - convert_tensor(repetition_penalty_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } - if 
(random_seed_opt.has_value()) { - input_tensors.insert( - {"random_seed", - convert_tensor(random_seed_opt.value(), ft::MemoryType::MEMORY_CPU)}); - } + {"start_pos", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_UINT32, std::vector{1}, &start_pos}}}; std::unordered_map output_tensors = std::unordered_map{ - {"output_ids", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_INT32, - std::vector{request_batch_size, beam_width, (size_t)total_output_len}, - get_ptr(output_ids)}}, - {"sequence_length", + {"output_logits", ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_INT32, - std::vector{request_batch_size, beam_width}, - get_ptr(sequence_lengths)}}}; - - if (return_cum_log_probs > 0) { - output_tensors.insert({"cum_log_probs", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width}, - get_ptr(cum_log_probs)}}); - } - + ft::TYPE_FP32, + std::vector{request_batch_size, seq_len, vocab_size_}, + get_ptr(output_logits)}}}; try { llama.forward(&output_tensors, &input_tensors, &llama_weights_); } @@ -265,17 +193,16 @@ class FTLLaMA: public IFLLaMA { } private: - const size_t head_num_; + const size_t num_heads_; const size_t size_per_head_; const size_t inter_size_; - const size_t layer_num_; + const size_t num_layers_; const size_t vocab_size_; const size_t rotary_embedding_dim_; - const int start_id_; - const int end_id_; - const bool use_gptj_residual_; - - // const ft::gptVariantParams gpt_variant_params_; + const size_t random_seed_; + const size_t max_seq_len_; + int64_t tensor_para_size_; + int64_t pipeline_para_size_; std::vector weights_; cublasLtHandle_t cublasltHandle_; @@ -286,44 +213,29 @@ class FTLLaMA: public IFLLaMA { ft::NcclParam tensor_para_; ft::NcclParam pipeline_para_; - - int64_t tensor_para_size_; - int64_t pipeline_para_size_; }; class LLaMA: public th::jit::CustomClassHolder { public: - LLaMA(const int64_t head_num, + LLaMA(const int64_t num_heads, const int64_t size_per_head, const int64_t inter_size, - const int64_t layer_num, + const int64_t num_layers, const int64_t vocab_size, const int64_t rotary_embedding_dim, - const int64_t start_id, - const int64_t end_id, + const int64_t random_seed, + const int64_t max_seq_len, const int64_t tensor_para_size, const int64_t pipeline_para_size, - const int64_t max_seq_len, - const bool use_gptj_residual, const vector weights); ~LLaMA(); - vector forward(th::Tensor input_ids, - th::Tensor input_lengths, - const int64_t output_len, - th::optional beam_width_opt, - th::optional top_k_opt, - th::optional top_p_opt, - th::optional beam_search_diversity_rate_opt, - th::optional temperature_opt, - th::optional len_penalty_opt, - th::optional repetition_penalty_opt, - th::optional random_seed_opt, - th::optional return_cum_log_probs_opt); + th::Tensor forward(th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t start_pos); private: const at::ScalarType st_; + size_t vocab_size_; IFLLaMA* ftllama; std::vector weights; }; diff --git a/src/fastertransformer/utils/memory_utils.cu b/src/fastertransformer/utils/memory_utils.cu index 134224a09..d795cbf99 100644 --- a/src/fastertransformer/utils/memory_utils.cu +++ b/src/fastertransformer/utils/memory_utils.cu @@ -177,7 +177,7 @@ __global__ void cudaCast(T_OUT* dst, T_IN* src, const size_t size) template void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream) { - cudaCast<<<256, 256, 0, stream>>>(dst, src, size); + cudaCast<<<(size + 255) / 256, 256, 0, stream>>>(dst, src, size); } template void invokeCudaCast(float* dst, half const* const 
src, const size_t size, cudaStream_t stream); From d403986a3691d4f2ab9046727126a114436dbb21 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 23 Sep 2023 21:45:26 +0000 Subject: [PATCH 26/55] debug code --- src/fastertransformer/kernels/gpt_kernels.cu | 435 +++++++++++++++--- src/fastertransformer/kernels/gpt_kernels.h | 9 + .../LLaMAContextAttentionLayer.cc | 111 +++-- src/fastertransformer/models/llama/LLaMA.cc | 129 +++++- .../models/llama/LLaMAContextDecoder.cc | 114 ++++- .../models/llama/LLaMAContextDecoder.h | 1 - src/fastertransformer/th_op/llama/LLaMA.cc | 2 +- src/fastertransformer/th_op/llama/LLaMA.h | 6 +- 8 files changed, 683 insertions(+), 124 deletions(-) diff --git a/src/fastertransformer/kernels/gpt_kernels.cu b/src/fastertransformer/kernels/gpt_kernels.cu index 7dc9af620..76852dc4d 100644 --- a/src/fastertransformer/kernels/gpt_kernels.cu +++ b/src/fastertransformer/kernels/gpt_kernels.cu @@ -114,7 +114,70 @@ __global__ void start_id_embedding_position_lookups_kernel(T* length, \ max_length, \ batch_size, \ - hidden_units); + hidden_units) +template +__global__ void start_id_embedding_lookups_kernel(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int64_t hidden_units) +{ + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * length * hidden_units; + index += blockDim.x * gridDim.x) { + + // embedding lookup from word ids [batch, length] (part of [batch, length]) and [vocab, hidden] to generate + // embedding [batch, length, hidden] + const int word_index = index / hidden_units; + const int word_index_row = word_index / length; // batch_id + const int word_index_col = word_index % length; + const int real_word_index = word_index_row * length + word_index_col; + const int col_index = index % hidden_units; + const int input_id = input_ids == nullptr ? real_word_index : input_ids[real_word_index]; + + from_tensor[index] = embedding_table[input_id * hidden_units + col_index]; + } +} + +template +void invokeInputIdsEmbeddingLookup(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream) +{ + dim3 grid(min(batch_size * length, 65536)); + dim3 block(min(hidden_units, 512)); + start_id_embedding_lookups_kernel + <<>>(from_tensor, embedding_table, input_ids, length, batch_size, hidden_units); +} + +template void invokeInputIdsEmbeddingLookup(float* from_tensor, + const float* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); +template void invokeInputIdsEmbeddingLookup(half* from_tensor, + const half* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); + +#ifdef ENABLE_BF16 +template void invokeInputIdsEmbeddingLookup(__nv_bfloat16* from_tensor, + const __nv_bfloat16* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); +#endif template void invokeInputIdsEmbeddingLookupPosEncoding(T* from_tensor, @@ -203,27 +266,89 @@ template void invokeInputIdsEmbeddingLookupPosEncoding(__nv_bfloat16* template __global__ void inputIdsEmbeddingLookupPosEncodingSoftPrompt(inputIdsEmbeddingLookupPosEncodingSoftPromptParam param) { - // 1. Copy the input ids to output ids and transpose output ids to [seq_len, batch_size, beam_width]. - // 2. 
Embedding lookup by input ids and concat with soft prompt. The axis of concatenation is on axis of seq_len. - - // Assume batch size is 2 and prompts are [[t1, t2], [t3], [t4, t5]], input_ids are [[s1, s2], [s3], [s4]] - // then the order of output_ids is - // [ [?, ?, s1, s2] - // [?, s3, padding, padding] - // [?, ?, s4, padding] ] - // and the order of embedding is - // [ [t1, t2, s1, s2] - // [t3, s3, padding, padding] - // [t4, t5, s4, padding] ] - // where "?" means undefined values and we should attach it. + // 1. Copy the + // input ids to + // output ids + // and + // transpose + // output ids + // to [seq_len, + // batch_size, + // beam_width]. + // 2. Embedding + // lookup by + // input ids + // and concat + // with soft + // prompt. The + // axis of + // concatenation + // is on axis + // of seq_len. + + // Assume batch + // size is 2 + // and prompts + // are [[t1, + // t2], [t3], + // [t4, t5]], + // input_ids + // are [[s1, + // s2], [s3], + // [s4]] then + // the order of + // output_ids + // is [ [?, ?, + // s1, s2] + // [?, s3, + // padding, + // padding] + // [?, ?, s4, + // padding] ] + // and the + // order of + // embedding is + // [ [t1, t2, + // s1, s2] + // [t3, s3, + // padding, + // padding] + // [t4, t5, + // s4, + // padding] ] + // where "?" + // means + // undefined + // values and + // we should + // attach it. for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < param.batch_size * param.beam_width * (param.max_prefix_soft_prompt_length + param.max_input_length) * param.hidden_units; index += blockDim.x * gridDim.x) { - // transpose the input_ids [batch, length] (part of [batch, beam, max_input_length]) to - // output_ids [length, batch, beam]. - // ouptut_ids need to add padding in the beginning for soft prompting. + // transpose + // the + // input_ids + // [batch, + // length] + // (part of + // [batch, + // beam, + // max_input_length]) + // to + // output_ids + // [length, + // batch, + // beam]. + // ouptut_ids + // need to + // add + // padding + // in the + // beginning + // for soft + // prompting. 
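To make the selection spelled out in the comments above concrete, a minimal CPU reference for the embedding side of the soft-prompt concatenation follows; it reproduces the [t1, t2, s1, s2] ordering from the example. This is illustrative only: the function name is invented, and the beam dimension, the `output_ids` transpose, and the padding fill are left out.

```
#include <cstddef>
#include <vector>

// CPU sketch of the embedding concatenation: positions before prompt_len[b]
// read the soft-prompt vectors, the remaining positions read the word
// embedding of the corresponding input id (beam width and padding omitted).
void soft_prompt_embedding_ref(std::vector<float>&       out,          // [batch, max_prompt_len + input_len, hidden]
                               const std::vector<float>& soft_prompt,  // [batch, max_prompt_len, hidden]
                               const std::vector<float>& table,        // [vocab, hidden]
                               const std::vector<int>&   ids,          // [batch, input_len]
                               const std::vector<int>&   prompt_len,   // [batch]
                               int batch, int max_prompt_len, int input_len, int hidden)
{
    const int total_len = max_prompt_len + input_len;
    for (int b = 0; b < batch; ++b) {
        for (int s = 0; s < prompt_len[b] + input_len; ++s) {
            for (int h = 0; h < hidden; ++h) {
                const float v = (s < prompt_len[b]) ?
                    soft_prompt[((size_t)b * max_prompt_len + s) * hidden + h] :
                    table[(size_t)ids[(size_t)b * input_len + (s - prompt_len[b])] * hidden + h];
                out[((size_t)b * total_len + s) * hidden + h] = v;
            }
        }
    }
}
```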
if (index < param.batch_size * param.beam_width * param.max_input_length) { int tmp_index = index; @@ -239,21 +364,43 @@ __global__ void inputIdsEmbeddingLookupPosEncodingSoftPrompt(inputIdsEmbeddingLo } } - // embedding lookup from word ids [batch, beam, length] (part of [batch, beam, max_input_length]), [vocab, - // hidden] and [batch, max_prefix_soft_prompt_length, hidden] to generate embedding [batch, beam, length + - // max_prefix_soft_prompt_length, hidden] - int tmp_index = index; - const int hidden_id = tmp_index % param.hidden_units; - tmp_index = (tmp_index - hidden_id) / param.hidden_units; - const int seq_id = tmp_index % (param.max_prefix_soft_prompt_length + param.max_input_length); - tmp_index = (tmp_index - seq_id) / (param.max_prefix_soft_prompt_length + param.max_input_length); - const int beam_id = tmp_index % param.beam_width; - tmp_index = (tmp_index - beam_id) / param.beam_width; - const int batch_id = tmp_index % param.batch_size; + // embedding + // lookup + // from + // word ids + // [batch, + // beam, + // length] + // (part of + // [batch, + // beam, + // max_input_length]), + // [vocab, + // hidden] + // and + // [batch, + // max_prefix_soft_prompt_length, + // hidden] + // to + // generate + // embedding + // [batch, + // beam, + // length + + // max_prefix_soft_prompt_length, + // hidden] + int tmp_index = index; + const int hidden_id = tmp_index % param.hidden_units; + tmp_index = (tmp_index - hidden_id) / param.hidden_units; + const int seq_id = tmp_index % (param.max_prefix_soft_prompt_length + param.max_input_length); + tmp_index = (tmp_index - seq_id) / (param.max_prefix_soft_prompt_length + param.max_input_length); + const int beam_id = tmp_index % param.beam_width; + tmp_index = (tmp_index - beam_id) / param.beam_width; + const int batch_id = tmp_index % param.batch_size; const int64_t hidden_units = param.hidden_units; - T embedding = + T embedding = (seq_id < param.prefix_soft_prompt_lengths[batch_id]) ? 
- (T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units + (T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units + seq_id * hidden_units + hidden_id] : param.embedding_table[param.input_ids[batch_id * param.beam_width * param.max_input_length + beam_id * param.max_input_length @@ -292,7 +439,8 @@ template void invokeInputIdsEmbeddingLookupPosEncodingSoftPrompt( inputIdsEmbeddingLookupPosEncodingSoftPromptParam<__nv_bfloat16> param); #endif -// TODO Add half2 implementation +// TODO Add half2 +// implementation template __global__ void transposeAxis01(T* out, T* in, const int dim0, const int dim1, const int dim2) { @@ -329,9 +477,11 @@ invokeTransposeAxis01(int* out, int* in, const int dim0, const int dim1, const i template __global__ void transposeAxis01(T* out, T* in, const int* in_skipping_dim1, const int dim0, const int dim1) { - // out: [dim1, dim0] - // in: [dim0, dim1] - // in_skipping_dim1: [dim1] + // out: [dim1, + // dim0] in: + // [dim0, dim1] + // in_skipping_dim1: + // [dim1] int index = threadIdx.x + blockIdx.x * blockDim.x; if (index < dim0 * dim1) { @@ -363,8 +513,15 @@ __global__ void buildDecoderAttentionMaskKernel(T* attention_mask, const int max_seq_len, const int max_prompt_length) { - // sequence_lengths: [batch_size] - // attention_mask: [batch_size, 1, max_seq_len, max_seq_len + max_prompt_length] + // sequence_lengths: + // [batch_size] + // attention_mask: + // [batch_size, + // 1, + // max_seq_len, + // max_seq_len + // + + // max_prompt_length] const int max_prompt_seq_length = max_seq_len + max_prompt_length; const int mask_size_per_seq = max_seq_len * max_prompt_seq_length; attention_mask += blockIdx.x * mask_size_per_seq; @@ -581,29 +738,100 @@ template __global__ void find_context_dups(int* shared_contexts, const int* input_ids, const size_t batch_size, const size_t input_seq_len) { - /* We compare all context pairs (i, j), with i (tgt) < j (src) , to detect duplicate - * inputs. If there's a match between i and j, we store i at the - * j-th position of shared_context. So that we know that j can be - * represented by i. shared_contexts is initialized like shared_contexts[i] = i - * and when there's a match, we actually use shared_contexts[j] = min(shared_contexts[j], i) - * so that in the end, shared_contexts effectively contains an index - * to the match with the lowest index context. - * Note that shared_contexts[i] <= i, a property that will be used when uncompacting + /* We compare + * all context + * pairs (i, + * j), with i + * (tgt) < j + * (src) , to + * detect + * duplicate + * inputs. If + * there's a + * match + * between i + * and j, we + * store i at + * the j-th + * position of + * shared_context. + * So that we + * know that j + * can be + * represented + * by i. + * shared_contexts + * is + * initialized + * like + * shared_contexts[i] + * = i and when + * there's a + * match, we + * actually use + * shared_contexts[j] + * = + * min(shared_contexts[j], + * i) so that + * in the end, + * shared_contexts + * effectively + * contains an + * index to the + * match with + * the lowest + * index + * context. + * Note that + * shared_contexts[i] + * <= i, a + * property + * that will be + * used when + * uncompacting * inputs. */ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; __shared__ bool match; - /* Each block is responsible for a (i, j) pair. 
To map the block space to - * the i < j space, we need to convert a linear addressing to a triangle, of - * size (batch_size * (batch_size - 1)) / 2 - * For more information, check https://en.wikipedia.org/wiki/Triangular_number + /* Each block + * is + * responsible + * for a (i, j) + * pair. To map + * the block + * space to the + * i < j space, + * we need to + * convert a + * linear + * addressing + * to a + * triangle, of + * size + * (batch_size + * * (batch_size - 1)) / 2 + * For more + * information, + * check + * https://en.wikipedia.org/wiki/Triangular_number */ - // blockIdx = [0, 1, 2, ... n(n-1)/2] -> base_index = [0, 1, 1, 2, 2, 2, 3, 3, 3, 3, ..., n - 2] + // blockIdx = + // [0, 1, 2, + // ... + // n(n-1)/2] -> + // base_index = + // [0, 1, 1, 2, + // 2, 2, 3, 3, + // 3, 3, ..., n + // - 2] const int base_index = floorf(0.5f * (sqrtf(1 + 8 * blockIdx.x) - 1)); - const int src_idx = base_index + 1; // base_index \in [1, batch_size) + const int src_idx = base_index + 1; // base_index + // \in + // [1, + // batch_size) const int rev_base_index = base_index * (base_index + 1) / 2; const int tgt_idx = blockIdx.x - rev_base_index; // tgt_idx \in [0, src_idx) @@ -659,9 +887,19 @@ __global__ void generate_dups_indices(int* batch_to_compact, if (!masked && is_first_occur) { int compact_idx = scan + (first_iter ? 0 : scan_offset); - // Context rep. writes initial index + // Context + // rep. + // writes + // initial + // index batch_to_compact[seq_idx * beam_width] = compact_idx; - // input ids are tiled in context part + // input + // ids + // are + // tiled + // in + // context + // part compact_to_batch[compact_idx] = seq_idx * beam_width; } @@ -674,13 +912,27 @@ __global__ void generate_dups_indices(int* batch_to_compact, __syncthreads(); if (!masked && !is_first_occur) { - // Fill the rest of batch_to_compact based on what rep. wrote + // Fill + // the + // rest + // of + // batch_to_compact + // based + // on + // what + // rep. 
+ // wrote const int src_idx = batch_to_compact[shared_contexts[seq_idx] * beam_width]; batch_to_compact[seq_idx * beam_width] = src_idx; } if (!masked) { - // set same compact idx for beams + // set + // same + // compact + // idx + // for + // beams for (int beam_id = 1; beam_id < beam_width; ++beam_id) { batch_to_compact[seq_idx * beam_width + beam_id] = batch_to_compact[seq_idx * beam_width]; } @@ -713,11 +965,20 @@ void invokeFindContextDups(int* shared_contexts, { dim3 block{512}; dim3 grid{((int)batch_size + block.x - 1) / block.x}; - // set shared_context[i] = i + // set + // shared_context[i] = + // i init_shared_contexts<<>>(shared_contexts, batch_size); grid = dim3{(unsigned int)(batch_size * (batch_size - 1)) / 2}; - // set shared_contexts[i] = j, where j = min{k, such that input_ids[k] == input_ids[i]} + // set + // shared_contexts[i] + // = j, where j + // = min{k, + // such that + // input_ids[k] + // == + // input_ids[i]} if (input_seq_len <= 128) { block = 128; find_context_dups<128><<>>(shared_contexts, input_ids, batch_size, input_seq_len); @@ -727,8 +988,21 @@ void invokeFindContextDups(int* shared_contexts, find_context_dups<256><<>>(shared_contexts, input_ids, batch_size, input_seq_len); } - // set batch_to_compact[i] = j, where j is the position of input_ids[i] in the compact_batch - // set compact_to_batch[i] = j, where j is such that compact_to_batch[i] = input_ids[j] + // set + // batch_to_compact[i] + // = j, where j + // is the + // position of + // input_ids[i] + // in the + // compact_batch + // set + // compact_to_batch[i] + // = j, where j + // is such that + // compact_to_batch[i] + // = + // input_ids[j] generate_dups_indices<<<1, DUPS_INDICES_BLOCK_SIZE, 0, stream>>>( batch_to_compact, compact_to_batch, compact_size, shared_contexts, batch_size, beam_width, input_seq_len); } @@ -782,10 +1056,29 @@ void invokeCompactInputs(T* compact_input, size_t hidden_dimension, cudaStream_t stream) { - /* Compact relevant decoder_layer inputs based on the identical contexts. - * For example, decoder_input is [batch_size, seq_len, H]. It's compacted - * into compact_input [compact_size, seq_len, H] such that - * compact_input[i, ...] = decoder_input[compact_idx[i], ...] */ + /* Compact + * relevant + * decoder_layer + * inputs based + * on the + * identical + * contexts. + * For example, + * decoder_input + * is + * [batch_size, + * seq_len, H]. + * It's + * compacted + * into + * compact_input + * [compact_size, + * seq_len, H] + * such that + * compact_input[i, + * ...] = + * decoder_input[compact_idx[i], + * ...] 
     const size_t elems_n = compact_size * seq_len * max(hidden_dimension, seq_len);
     const dim3   blockDim(512);
     const dim3   gridDim((elems_n + 512 - 1) / 512);
@@ -828,8 +1121,19 @@ __global__ void uncompact_outputs(T* uncompact_buffer,
                                   size_t batch_size,
                                   size_t buffer_stride)
 {
-    /* Uncompact a buffer IN of size [Compact, Stride] into OUT of size [Batch, Stride]
-     * so that \forall i, OUT[i, :] = IN[batch_to_compact_idx[i], :]
+    /* Uncompact a buffer IN of size [Compact, Stride] into OUT of size [Batch, Stride]
+     * so that \forall i, OUT[i, :] = IN[batch_to_compact_idx[i], :]
      */
     const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -1124,4 +1428,5 @@ INSTANTIATE_INVOKE_SUM_LENGTH_DIMENSION(__nv_bfloat16);
 #endif
 #undef INSTANTIATE_INVOKE_SUM_LENGTH_DIMENSION
-}  // namespace fastertransformer
+}  // namespace fastertransformer
diff --git a/src/fastertransformer/kernels/gpt_kernels.h b/src/fastertransformer/kernels/gpt_kernels.h
index d78224e0a..bf4963231 100644
--- a/src/fastertransformer/kernels/gpt_kernels.h
+++ b/src/fastertransformer/kernels/gpt_kernels.h
@@ -59,6 +59,15 @@ struct pPromptTuningParam {
     const T* request_prompt_embedding = nullptr;
 };
+template<typename T>
+void invokeInputIdsEmbeddingLookup(T*           from_tensor,
+                                   const T*     embedding_table,
+                                   const int*   input_ids,
+                                   const int    length,
+                                   const int    batch_size,
+                                   const int    hidden_units,
+                                   cudaStream_t stream);
+
 template<typename T>
 void invokeInputIdsEmbeddingLookupPosEncoding(T*   from_tensor,
                                               int* output_ids,
diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc
index f0dfce8c7..a1cb9b81f 100644
--- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc
+++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc
@@ -48,7 +48,7 @@ void LLaMAContextAttentionLayer<T>::forward(TensorMap* output_ten
     const int  layer_id       = input_tensors->getVal<int>("layer_id");
     const int* padding_offset = input_tensors->getPtr<int>("padding_offset", nullptr);
     int*       cu_seqlens     = input_tensors->getPtr<int>("cu_seqlens", nullptr);
-    size_t start_pos = input_tensors->at("start_pos").max();
+    int start_pos = input_tensors->at("start_pos").max();
     T* attention_out   = output_tensors->at("hidden_features").getPtr<T>();
     T* attention_input = input_tensors->at("input_query").getPtr<T>();
@@ -79,13 +79,54 @@ void LLaMAContextAttentionLayer<T>::forward(TensorMap* output_ten
                               qkv_buf_,
                               3 * hidden_units_ /* n */);
     sync_check_cuda_error();
-    // IDEA: append prefix prompt key value here
-    PrefixPromptBatchWeightsParam<T> param{nullptr, nullptr, 0, (size_t)layer_id * 2 * head_num_ * size_per_head_};
+    /*
+    if (layer_id < 15) {
+        T* out = (T*)malloc(sizeof(T) * m * 3 * hidden_units_);
+        T *tmp = out;
+        cudaMemcpy(
+            out, qkv_buf_, sizeof(T) * m * 3 * hidden_units_, cudaMemcpyDeviceToHost);
+        for (int i = 0; i < 3; ++i) {
+            for (int b = 0; b < batch_size; ++b) {
+                std::cout << "[\n";
+                for (int s = 0; s < 3; ++s) {
+                    std::cout << "[ ";
+                    for (int h = 0; h < 3; ++h) {
+                        std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " ";
+                    }
+                    std::cout << " ... 
"; + for (int h = hidden_units_-3; h < hidden_units_; ++h) { + std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; + } + std::cout << "]\n"; + } + std::cout << "...\n"; + for (int s = seq_len-3; s < seq_len; ++s) { + std::cout << "[ "; + for (int h = 0; h < 3; ++h) { + std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; + } + std::cout << " ... "; + for (int h = hidden_units_-3; h < hidden_units_; ++h) { + std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; + } + std::cout << "]\n"; + } + std::cout << "]\n"; + } + std::cout << "\n"; + out += hidden_units_; + } + + free(tmp); + } + */ if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); + sync_check_cuda_error(); } + PrefixPromptBatchWeightsParam param; invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, v_buf_2_, @@ -105,30 +146,26 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten stream_); sync_check_cuda_error(); - const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length - // Use batch major - // put k/v_buf from shape [B, H, L, Dh] - // to cache [B, H, Dh/x, L, x] and [B, H, L, Dh/x, x] - // TODO: Cache implementation - // k_cache: [batch_size, num_heads, L, Dh] - // k_buf: [batch_size, num_heads, start_pos + seq_len, Dh] - // v_buf: [batch_size, num_heads, L, Dh] - - invokeTranspose4dBatchMajor(output_tensors->getPtr("key_cache"), - output_tensors->getPtr("value_cache"), - k_buf_2_, - v_buf_2_, - batch_size, - seq_len, - max_seq_len, - size_per_head_, - head_num_, - stream_); - // IDEA : after this, - // k_cache = (batch_size, num_heads, Dh/x, L, x) - // v_cache = (batch_size, num_heads, L, Dh) - sync_check_cuda_error(); - POP_RANGE; + // const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length + // // Use batch major + // // put k/v_buf from shape [B, H, L, Dh] + // // to cache [B, H, Dh/x, L, x] and [B, H, L, Dh/x, x] + // // TODO: Cache implementation + // // k_cache: [batch_size, num_heads, L, Dh] + // // k_buf: [batch_size, num_heads, start_pos + seq_len, Dh] + // // v_buf: [batch_size, num_heads, L, Dh] + // invokeTranspose4dBatchMajor(output_tensors->getPtr("key_cache"), + // output_tensors->getPtr("value_cache"), + // k_buf_2_, + // v_buf_2_, + // batch_size, + // seq_len, + // max_seq_len, + // size_per_head_, + // head_num_, + // stream_); + // sync_check_cuda_error(); + // POP_RANGE; if (attention_type == AttentionType::FUSED_MHA) { dispatcher_fp16->setup_causal_masked_fmha(seq_len, batch_size); @@ -136,8 +173,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } else { const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = seq_len; // q length - const int attention_seq_len_2 = start_pos + seq_len; // kv length + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = seq_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); // @@ -226,16 +263,21 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten size_per_head_, attention_seq_len_1, attention_seq_len_2, + v_buf_2_, size_per_head_, attention_seq_len_2 * size_per_head_, + qk_buf_, attention_seq_len_2, attention_seq_len_1 * attention_seq_len_2, + qkv_buf_2_, size_per_head_, attention_seq_len_1 * size_per_head_, + batch_size * 
head_num_); + sync_check_cuda_error(); // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) if (padding_offset == nullptr) { @@ -262,6 +304,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten attention_weights->attention_output_weight.scale, 0, // int8_mode stream_); + sync_check_cuda_error(); } POP_RANGE; } @@ -279,6 +322,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten hidden_units_, attention_out, hidden_units_); + sync_check_cuda_error(); POP_RANGE; if (is_free_buffer_after_forward_ == true) { @@ -289,8 +333,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( - size_t head_num, +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, @@ -307,8 +350,7 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( } template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( - size_t head_num, +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, size_t local_head_num, cudaStream_t stream, @@ -328,8 +370,7 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( } template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer( - size_t head_num, +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, size_t local_head_num, size_t rotary_embedding_dim, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index e4c4e4ee8..d21302c61 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -23,8 +23,104 @@ #include #include +#include + namespace fastertransformer { +template +static void _print_tensor1(T* out, int dim1, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + + std::cout << "["; + for (int i = start0; i < end0; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != dim1 - 1) + std::cout << ", "; + } + if (end0 != start1) { + std::cout << "..., "; + } + for (int i = start1; i < end1; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != end1 - 1) + std::cout << ", "; + } + std::cout << "]"; +} + +template +static void _print_tensor2(T* out, int dim1, int dim2, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor1(&out[i * dim2], dim2, indent + 1); + if (i != dim1 - 1) + std::cout << ",\n"; + } + if (end0 != start1) { + std::cout << ind; + std::cout << "...,\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor1(&out[i * dim2], dim2, indent + 1); + if (i != end1 - 1) + std::cout << ",\n"; + } + std::cout << "]"; +} + +template +static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int indent) +{ + std::string ind(indent, ' '); + + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 
0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); + if (i != dim1 - 1) + std::cout << ",\n\n"; + } + if (start1 != end1) { + std::cout << ind; + std::cout << "...,\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); + if (i != end1 - 1) + std::cout << ",\n"; + } + std::cout << "]\n"; +} + +template +static void print_tensor3(T* in, int dim1, int dim2, int dim3) +{ + T* out = (T*)malloc(sizeof(T) * dim1 * dim2 * dim3); + cudaMemcpy(out, in, sizeof(T) * dim1 * dim2 * dim3, cudaMemcpyDeviceToHost); + _print_tensor3(out, dim1, dim2, dim3, 1); + free(out); +} + template void LLaMA::initialize() { @@ -56,7 +152,7 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batch_size * max_seq_len * hidden_units_; input_attention_mask_ = - (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * max_seq_len, false)); + (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); decoder_output_buf_ = (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); normed_decoder_output_buf_ = @@ -204,6 +300,7 @@ void LLaMA::forward(std::unordered_map* output_ten const std::unordered_map* input_tensors, const LLaMAWeight* llama_weights) { + // Logger::getLogger().setLevel(Logger::Level::TRACE); // input_tensors: // input_ids [batch_size, seq_len] // input_lengths [batch_size] @@ -224,7 +321,7 @@ void LLaMA::forward(std::unordered_map* output_ten int seq_len = input_tensors->at("input_ids").shape[1]; // max cache seq len should include max prefix prompt length as it has k/v states - const size_t start_pos = input_tensors->at("start_pos").max(); + const int start_pos = input_tensors->at("start_pos").max(); const cudaDataType_t gemm_data_type = getCudaDataType(); allocateBuffer(batch_size, seq_len, max_seq_len_); @@ -243,7 +340,7 @@ void LLaMA::forward(std::unordered_map* output_ten invokeTileGptInputs(tiled_input_ids_buf_, tiled_input_lengths_buf_, input_tensors->at("input_ids").getPtr(), - input_tensors->at("input_lengths").getPtr(), + input_tensors->at("input_lengths").getPtr(), batch_size, 1, seq_len, @@ -255,19 +352,18 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); if (pipeline_para_.rank_ == 0) { - invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, - nullptr, - llama_weights->pre_decoder_embedding_table, - llama_weights->position_encoding_table, - pPromptTuningParam{}, // no p/prompt tuning - tiled_input_ids_buf_, - 1, - seq_len, - seq_len, // must be same - batch_size, - hidden_units_, - stream_); + invokeInputIdsEmbeddingLookup(context_decoder_input_buf_, + llama_weights->pre_decoder_embedding_table, + tiled_input_ids_buf_, + seq_len, + batch_size, + hidden_units_, + stream_); sync_check_cuda_error(); + +// std::cout << 0 << "==================" << "EMBEDDING\n"; +// print_tensor3(context_decoder_input_buf_, batch_size, seq_len, hidden_units_); +// std::cout << 0 << "==================" << "EMBEDDING\n"; } std::unordered_map decoder_input_tensors{ @@ -276,7 +372,7 @@ void LLaMA::forward(std::unordered_map* output_ten {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(seq_len)}, input_attention_mask_}}, 
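            // Note on the mask: this is the buffer sized batch_size * seq_len * seq_len in
            // allocateBuffer, viewed here as [batch_size, 1, seq_len, seq_len]. Assuming the usual
            // causal layout (mask[b][0][i][j] == 1 iff position j is visible to position i), the
            // unfused softmax kernel applies (1.0f - mask_val) * -10000.0f as a bias to masked logits.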
{"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, tiled_input_lengths_buf_}}, - {"start_pos", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &start_pos}}}; + {"start_pos", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &start_pos}}}; std::unordered_map decoder_output_tensors{ {"decoder_output", @@ -298,6 +394,7 @@ void LLaMA::forward(std::unordered_map* output_ten hidden_units_, stream_); sync_check_cuda_error(); + cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, vocab_size_, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 66fc30b6b..587118703 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -21,8 +21,104 @@ #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" +#include + namespace fastertransformer { +template +static void _print_tensor1(T* out, int dim1, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + + std::cout << "["; + for (int i = start0; i < end0; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != dim1 - 1) + std::cout << ", "; + } + if (end0 != start1) { + std::cout << "..., "; + } + for (int i = start1; i < end1; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != end1 - 1) + std::cout << ", "; + } + std::cout << "]"; +} + +template +static void _print_tensor2(T* out, int dim1, int dim2, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor1(&out[i * dim2], dim2, indent + 1); + if (i != dim1 - 1) + std::cout << ",\n"; + } + if (end0 != start1) { + std::cout << ind; + std::cout << "...,\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor1(&out[i * dim2], dim2, indent + 1); + if (i != end1 - 1) + std::cout << ",\n"; + } + std::cout << "]"; +} + +template +static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int indent) +{ + std::string ind(indent, ' '); + + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 
0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); + if (i != dim1 - 1) + std::cout << ",\n\n"; + } + if (start1 != end1) { + std::cout << ind; + std::cout << "...,\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); + if (i != end1 - 1) + std::cout << ",\n"; + } + std::cout << "]\n"; +} + +template +static void print_tensor3(T* in, int dim1, int dim2, int dim3) +{ + T* out = (T*)malloc(sizeof(T) * dim1 * dim2 * dim3); + cudaMemcpy(out, in, sizeof(T) * dim1 * dim2 * dim3, cudaMemcpyDeviceToHost); + _print_tensor3(out, dim1, dim2, dim3, 1); + free(out); +} + template void LLaMAContextDecoder::initialize() { @@ -64,8 +160,6 @@ void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); self_attn_output_ = reinterpret_cast( allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); - ffn_output_ = reinterpret_cast( - allocator_->reMalloc(ffn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); decoder_layer_output_ = reinterpret_cast( allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); @@ -81,7 +175,6 @@ void LLaMAContextDecoder::freeBuffer() if (is_allocate_buffer_ == true) { allocator_->free((void**)(&decoder_normed_input_)); allocator_->free((void**)(&self_attn_output_)); - allocator_->free((void**)(&ffn_output_)); allocator_->free((void**)(&decoder_layer_output_)); allocator_->free((void**)(&h_pinned_token_num_ptr_), true); allocator_->free((void**)(&padding_offset_)); @@ -280,6 +373,13 @@ void LLaMAContextDecoder::forward(std::unordered_map* ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); sync_check_cuda_error(); } +// if (isFirstLayerParallelId(l)) { +// std::cout << l << "==================" << "RECV\n"; +// print_tensor3(layer_input, batch_size, seq_len, hidden_units_); +// std::cout << l << "==================" << "RECV\n"; +// std::cout << std::flush; +// } + invokeGeneralLLaMALayerNorm(decoder_normed_input_, layer_input, @@ -352,6 +452,14 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); + +// if (isLastLayerParallelId(l)) { +// std::cout << l << "==================" << "SEND\n"; +// print_tensor3(layer_input, batch_size, seq_len, hidden_units_); +// std::cout << l << "==================" << "SEND\n"; +// std::cout << std::flush; +// } + if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index 452567208..cb6736f02 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -69,7 +69,6 @@ class LLaMAContextDecoder: public BaseLayer { protected: T* decoder_normed_input_ = nullptr; T* self_attn_output_ = nullptr; - T* ffn_output_ = nullptr; T* decoder_layer_output_ = nullptr; size_t* h_pinned_token_num_ptr_ = nullptr; int* padding_offset_ = nullptr; diff --git 
a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index 7be46c7ed..45c1e1575 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -88,7 +88,7 @@ LLaMA::forward(th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t s const int seq_len = input_ids.size(1); th::Tensor output_logits = torch::empty({batch_size, seq_len, (long)vocab_size_}, torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false)); - ftllama->forward(output_logits, input_ids, input_lengths, start_pos); + ftllama->forward(output_logits, input_ids, input_lengths, (int)start_pos); return output_logits; } diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 0d97dc322..ab594c5c7 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -30,7 +30,7 @@ class IFLLaMA { public: virtual ~IFLLaMA() {} virtual void - forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t start_pos) = 0; + forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int start_pos) = 0; }; template @@ -117,7 +117,7 @@ class FTLLaMA: public IFLLaMA { virtual void forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, - const int64_t start_pos) override + const int start_pos) override { auto stream = at::cuda::getCurrentCUDAStream().stream(); cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); @@ -171,7 +171,7 @@ class FTLLaMA: public IFLLaMA { {"input_lengths", ft::Tensor{ ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{request_batch_size}, get_ptr(input_lengths)}}, - {"start_pos", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_UINT32, std::vector{1}, &start_pos}}}; + {"start_pos", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &start_pos}}}; std::unordered_map output_tensors = std::unordered_map{ {"output_logits", From db6efddf6e3112548dc1ce8bbc6234b9646c6e57 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 24 Sep 2023 00:12:35 +0000 Subject: [PATCH 27/55] dump --- .../kernels/layernorm_kernels.cu | 47 ++--- .../kernels/layernorm_kernels.h | 1 - .../LLaMAContextAttentionLayer.cc | 73 ++++---- src/fastertransformer/models/llama/LLaMA.cc | 98 +--------- .../models/llama/LLaMAContextDecoder.cc | 138 +++------------ src/fastertransformer/utils/llama_utils.h | 167 ++++++++++++++++++ 6 files changed, 232 insertions(+), 292 deletions(-) create mode 100644 src/fastertransformer/utils/llama_utils.h diff --git a/src/fastertransformer/kernels/layernorm_kernels.cu b/src/fastertransformer/kernels/layernorm_kernels.cu index 6244dbfd6..80a656cf7 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.cu +++ b/src/fastertransformer/kernels/layernorm_kernels.cu @@ -2511,56 +2511,36 @@ template void invokeGeneralT5LayerNorm(__nv_bfloat16* out, /******************* invokeGeneralLLaMALayerNorm ***********************/ template -__global__ void generalLLaMALayerNorm(const T* __restrict input, - const T* __restrict gamma, - const T* __restrict beta, - T* normed_output, - const float layernorm_eps, - int m, - int n) +__global__ void generalLLaMALayerNorm( + const T* __restrict input, const T* __restrict gamma, T* normed_output, const float layernorm_eps, int m, int n) { const int tid = threadIdx.x; - extern __shared__ __align__(sizeof(float)) char _shmem[]; - T* shmem = reinterpret_cast(_shmem); - - __shared__ float s_variance; - float variance 
= 0.0f; - - using Float_Packed_T = typename packed_as::value>::type; - using Scalar_T = typename packed_as::type; - float local_var_sum = 0.0f; for (int i = tid; i < n; i += blockDim.x) { float val = (float)(ldg(&input[blockIdx.x * n + i])); local_var_sum += val * val; } - variance = blockReduceSum(local_var_sum); + float variance = 0.0f; + variance = blockReduceSum(local_var_sum); + __shared__ float s_variance; if (threadIdx.x == 0) { - s_variance = rsqrtf(variance / (float)n + layernorm_eps); + s_variance = rsqrtf((variance / (float)n) + layernorm_eps); } __syncthreads(); for (int i = tid; i < n; i += blockDim.x) { - const int index = blockIdx.x * n + i; - float beta_val = (beta == nullptr) ? 0.0f : (float)ldg(&beta[i]); - T val = (T)(((float)input[index] * s_variance) * (float)(ldg(&gamma[i])) + beta_val); - - normed_output[index] = val; + const int index = blockIdx.x * n + i; + T val = (T) (((float)ldg(&input[index])) * s_variance); + normed_output[index] = val * ldg(&gamma[i]); } } template -void invokeGeneralLLaMALayerNorm(T* out, - const T* input, - const T* gamma, - const T* beta, - const float layernorm_eps, - const int m, - const int n, - cudaStream_t stream) +void invokeGeneralLLaMALayerNorm( + T* out, const T* input, const T* gamma, const float layernorm_eps, const int m, const int n, cudaStream_t stream) { dim3 grid(m); dim3 block(min(n, 1024)); @@ -2572,13 +2552,12 @@ void invokeGeneralLLaMALayerNorm(T* out, block.x = 1024; } - generalLLaMALayerNorm<<>>(input, gamma, beta, out, layernorm_eps, m, n); + generalLLaMALayerNorm<<>>(input, gamma, out, layernorm_eps, m, n); } template void invokeGeneralLLaMALayerNorm(float* out, const float* input, const float* gamma, - const float* beta, const float layernorm_eps, const int m, const int n, @@ -2586,7 +2565,6 @@ template void invokeGeneralLLaMALayerNorm(float* out, template void invokeGeneralLLaMALayerNorm(half* out, const half* input, const half* gamma, - const half* beta, const float layernorm_eps, const int m, const int n, @@ -2595,7 +2573,6 @@ template void invokeGeneralLLaMALayerNorm(half* out, template void invokeGeneralLLaMALayerNorm(__nv_bfloat16* out, const __nv_bfloat16* input, const __nv_bfloat16* gamma, - const __nv_bfloat16* beta, const float layernorm_eps, const int m, const int n, diff --git a/src/fastertransformer/kernels/layernorm_kernels.h b/src/fastertransformer/kernels/layernorm_kernels.h index 8fb8ecf8b..c7b31e874 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.h +++ b/src/fastertransformer/kernels/layernorm_kernels.h @@ -180,7 +180,6 @@ template void invokeGeneralLLaMALayerNorm(T* out, const T* input, const T* gamma, - const T* beta, const float layernorm_eps, const int m, const int n, diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index a1cb9b81f..151a1012c 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -18,6 +18,7 @@ #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" #include "src/fastertransformer/kernels/layernorm_kernels.h" #include "src/fastertransformer/kernels/unfused_attention_kernels.h" +#include "src/fastertransformer/utils/llama_utils.h" #include "src/fastertransformer/utils/nvtx_utils.h" namespace fastertransformer { @@ -79,58 +80,25 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, 
3 * hidden_units_ /* n */); sync_check_cuda_error(); - /* - if (layer_id < 15) { - T* out = (T*)malloc(sizeof(T) * m * 3 * hidden_units_); - T *tmp = out; - cudaMemcpy( - out, qkv_buf_, sizeof(T) * m * 3 * hidden_units_, cudaMemcpyDeviceToHost); - for (int i = 0; i < 3; ++i) { - for (int b = 0; b < batch_size; ++b) { - std::cout << "[\n"; - for (int s = 0; s < 3; ++s) { - std::cout << "[ "; - for (int h = 0; h < 3; ++h) { - std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; - } - std::cout << " ... "; - for (int h = hidden_units_-3; h < hidden_units_; ++h) { - std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; - } - std::cout << "]\n"; - } - std::cout << "...\n"; - for (int s = seq_len-3; s < seq_len; ++s) { - std::cout << "[ "; - for (int h = 0; h < 3; ++h) { - std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; - } - std::cout << " ... "; - for (int h = hidden_units_-3; h < hidden_units_; ++h) { - std::cout << out[b * seq_len * 3 * hidden_units_ + s * 3 * hidden_units_ + h] << " "; - } - std::cout << "]\n"; - } - std::cout << "]\n"; - } - std::cout << "\n"; - out += hidden_units_; - } - - free(tmp); + if (true) { + print_tensor3(qkv_buf_, + batch_size, + seq_len, + hidden_units_, + seq_len * hidden_units_ * 3, + hidden_units_ * 3, + batch_size * seq_len * hidden_units_ * 3, + 2*hidden_units_); } - */ if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); sync_check_cuda_error(); } - PrefixPromptBatchWeightsParam param; invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, v_buf_2_, - param, // prefix prompt qkv_buf_, attention_weights->query_weight.bias, padding_offset, @@ -139,12 +107,31 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten m, head_num_, size_per_head_, - rotary_embedding_dim_, + stream_); + /* + invokeAddFusedQKVBiasTranspose(q_buf_2_, + k_buf_2_, + v_buf_2_, + PrefixPromptBatchWeightsParam{}, + qkv_buf_, + attention_weights->query_weight.bias, + padding_offset, + batch_size, + seq_len, + m, + head_num_, + size_per_head_, + //rotary_embedding_dim_, + 0, false, attention_weights->query_weight.scale_out, 0, // int8_mode stream_); + */ sync_check_cuda_error(); + // if (true) { + // print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); + // } // const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length // // Use batch major diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index d21302c61..cdf8071e6 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -20,107 +20,12 @@ #include "src/fastertransformer/kernels/gpt_kernels.h" #include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" #include "src/fastertransformer/utils/memory_utils.h" +#include "src/fastertransformer/utils/llama_utils.h" #include #include -#include - namespace fastertransformer { -template -static void _print_tensor1(T* out, int dim1, int indent) -{ - std::string ind(indent, ' '); - int start0 = 0; - int end0 = (dim1 < 3) ? dim1 : 3; - int start1 = (dim1 < 3) ? 0 : dim1 - 3; - int end1 = (dim1 < 3) ? 
0 : dim1; - - std::cout << "["; - for (int i = start0; i < end0; ++i) { - std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; - if (i != dim1 - 1) - std::cout << ", "; - } - if (end0 != start1) { - std::cout << "..., "; - } - for (int i = start1; i < end1; ++i) { - std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; - if (i != end1 - 1) - std::cout << ", "; - } - std::cout << "]"; -} - -template -static void _print_tensor2(T* out, int dim1, int dim2, int indent) -{ - std::string ind(indent, ' '); - int start0 = 0; - int end0 = (dim1 < 3) ? dim1 : 3; - int start1 = (dim1 < 3) ? 0 : dim1 - 3; - int end1 = (dim1 < 3) ? 0 : dim1; - std::cout << "["; - for (int i = start0; i < end0; ++i) { - if (i != start0) - std::cout << ind; - _print_tensor1(&out[i * dim2], dim2, indent + 1); - if (i != dim1 - 1) - std::cout << ",\n"; - } - if (end0 != start1) { - std::cout << ind; - std::cout << "...,\n"; - } - for (int i = start1; i < end1; ++i) { - std::cout << ind; - _print_tensor1(&out[i * dim2], dim2, indent + 1); - if (i != end1 - 1) - std::cout << ",\n"; - } - std::cout << "]"; -} - -template -static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int indent) -{ - std::string ind(indent, ' '); - - int start0 = 0; - int end0 = (dim1 < 3) ? dim1 : 3; - int start1 = (dim1 < 3) ? 0 : dim1 - 3; - int end1 = (dim1 < 3) ? 0 : dim1; - std::cout << "["; - for (int i = start0; i < end0; ++i) { - if (i != start0) - std::cout << ind; - _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); - if (i != dim1 - 1) - std::cout << ",\n\n"; - } - if (start1 != end1) { - std::cout << ind; - std::cout << "...,\n"; - } - for (int i = start1; i < end1; ++i) { - std::cout << ind; - _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); - if (i != end1 - 1) - std::cout << ",\n"; - } - std::cout << "]\n"; -} - -template -static void print_tensor3(T* in, int dim1, int dim2, int dim3) -{ - T* out = (T*)malloc(sizeof(T) * dim1 * dim2 * dim3); - cudaMemcpy(out, in, sizeof(T) * dim1 * dim2 * dim3, cudaMemcpyDeviceToHost); - _print_tensor3(out, dim1, dim2, dim3, 1); - free(out); -} - template void LLaMA::initialize() { @@ -388,7 +293,6 @@ void LLaMA::forward(std::unordered_map* output_ten invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, - llama_weights->post_decoder_layernorm.beta, layernorm_eps_, batch_size * seq_len, hidden_units_, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 587118703..275c61ad3 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -20,105 +20,10 @@ #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" - -#include +#include "src/fastertransformer/utils/llama_utils.h" namespace fastertransformer { -template -static void _print_tensor1(T* out, int dim1, int indent) -{ - std::string ind(indent, ' '); - int start0 = 0; - int end0 = (dim1 < 3) ? dim1 : 3; - int start1 = (dim1 < 3) ? 0 : dim1 - 3; - int end1 = (dim1 < 3) ? 
0 : dim1; - - std::cout << "["; - for (int i = start0; i < end0; ++i) { - std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; - if (i != dim1 - 1) - std::cout << ", "; - } - if (end0 != start1) { - std::cout << "..., "; - } - for (int i = start1; i < end1; ++i) { - std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; - if (i != end1 - 1) - std::cout << ", "; - } - std::cout << "]"; -} - -template -static void _print_tensor2(T* out, int dim1, int dim2, int indent) -{ - std::string ind(indent, ' '); - int start0 = 0; - int end0 = (dim1 < 3) ? dim1 : 3; - int start1 = (dim1 < 3) ? 0 : dim1 - 3; - int end1 = (dim1 < 3) ? 0 : dim1; - std::cout << "["; - for (int i = start0; i < end0; ++i) { - if (i != start0) - std::cout << ind; - _print_tensor1(&out[i * dim2], dim2, indent + 1); - if (i != dim1 - 1) - std::cout << ",\n"; - } - if (end0 != start1) { - std::cout << ind; - std::cout << "...,\n"; - } - for (int i = start1; i < end1; ++i) { - std::cout << ind; - _print_tensor1(&out[i * dim2], dim2, indent + 1); - if (i != end1 - 1) - std::cout << ",\n"; - } - std::cout << "]"; -} - -template -static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int indent) -{ - std::string ind(indent, ' '); - - int start0 = 0; - int end0 = (dim1 < 3) ? dim1 : 3; - int start1 = (dim1 < 3) ? 0 : dim1 - 3; - int end1 = (dim1 < 3) ? 0 : dim1; - std::cout << "["; - for (int i = start0; i < end0; ++i) { - if (i != start0) - std::cout << ind; - _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); - if (i != dim1 - 1) - std::cout << ",\n\n"; - } - if (start1 != end1) { - std::cout << ind; - std::cout << "...,\n"; - } - for (int i = start1; i < end1; ++i) { - std::cout << ind; - _print_tensor2(&out[i * dim2 * dim3], dim2, dim3, indent + 1); - if (i != end1 - 1) - std::cout << ",\n"; - } - std::cout << "]\n"; -} - -template -static void print_tensor3(T* in, int dim1, int dim2, int dim3) -{ - T* out = (T*)malloc(sizeof(T) * dim1 * dim2 * dim3); - cudaMemcpy(out, in, sizeof(T) * dim1 * dim2 * dim3, cudaMemcpyDeviceToHost); - _print_tensor3(out, dim1, dim2, dim3, 1); - free(out); -} - template void LLaMAContextDecoder::initialize() { @@ -351,8 +256,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { - invokeRemovePadding( - decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); + invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); sync_check_cuda_error(); } @@ -373,24 +277,23 @@ void LLaMAContextDecoder::forward(std::unordered_map* ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); sync_check_cuda_error(); } -// if (isFirstLayerParallelId(l)) { -// std::cout << l << "==================" << "RECV\n"; -// print_tensor3(layer_input, batch_size, seq_len, hidden_units_); -// std::cout << l << "==================" << "RECV\n"; -// std::cout << std::flush; -// } - invokeGeneralLLaMALayerNorm(decoder_normed_input_, layer_input, llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - llama_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, layernorm_eps_, h_token_num, hidden_units_, stream_); sync_check_cuda_error(); + if (true) { + std::cout << l << "==================" << "ATTN_NORM\n"; + print_tensor3(decoder_normed_input_, batch_size, seq_len, hidden_units_); + std::cout << l << "==================" << "ATTN_NORM\n"; + std::cout << 
std::flush; + } + TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, {"attention_mask", @@ -419,13 +322,24 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + std::cout << l << "==================" << "QBUF\n"; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); + std::cout << l << "==================" << "QBUF\n"; + std::cout << std::flush; + +// if (true) { +// std::cout << l << "==================" << "ATTENTION\n"; +// print_tensor3(self_attn_output_, batch_size, seq_len, hidden_units_); +// std::cout << l << "==================" << "ATTENTION\n"; +// std::cout << std::flush; +// } + invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, - layer_input, + decoder_normed_input_, self_attn_output_, layer_input, llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, @@ -438,7 +352,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); TensorMap ffn_input_tensors( - {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_input}}}); + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); TensorMap ffn_output_tensors( {{"ffn_output", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); @@ -452,14 +366,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); - -// if (isLastLayerParallelId(l)) { -// std::cout << l << "==================" << "SEND\n"; -// print_tensor3(layer_input, batch_size, seq_len, hidden_units_); -// std::cout << l << "==================" << "SEND\n"; -// std::cout << std::flush; -// } - if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h new file mode 100644 index 000000000..c1c2632c7 --- /dev/null +++ b/src/fastertransformer/utils/llama_utils.h @@ -0,0 +1,167 @@ +#include +#include +#include + +namespace fastertransformer { + +template +static void _print_tensor1(T* out, int dim1, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + + std::cout << "["; + for (int i = start0; i < end0; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != dim1 - 1) + std::cout << " "; + } + if (end0 != start1) { + std::cout << "... "; + } + for (int i = start1; i < end1; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != end1 - 1) + std::cout << " "; + } + std::cout << "]"; +} + +template +static void _print_tensor2(T* out, int dim1, int dim2, int stride, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 
0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor1(&out[i * stride], dim2, indent + 1); + if (i != dim1 - 1) + std::cout << "\n"; + } + if (end0 != start1) { + std::cout << ind; + std::cout << "...\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor1(&out[i * stride], dim2, indent + 1); + if (i != end1 - 1) + std::cout << "\n"; + } + std::cout << "]"; +} + +template +static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int stride1, int stride2, int indent) +{ + std::string ind(indent, ' '); + + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor2(&out[i * stride1], dim2, dim3, stride2, indent + 1); + if (i != dim1 - 1) + std::cout << "\n\n"; + } + if (start1 != end1) { + std::cout << ind; + std::cout << "...\n\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor2(&out[i * stride1], dim2, dim3, stride2, indent + 1); + if (i != end1 - 1) + std::cout << "\n"; + } + std::cout << "]\n"; +} + +template +static void +_print_tensor4(T* out, int dim1, int dim2, int dim3, int dim4, int stride1, int stride2, int stride3, int indent) +{ + std::string ind(indent, ' '); + + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor3(&out[i * stride1], dim2, dim3, dim4, stride2, stride3, indent + 1); + if (i != dim1 - 1) + std::cout << "\n\n"; + } + if (start1 != end1) { + std::cout << ind; + std::cout << "...\n\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor3(&out[i * stride1], dim2, dim3, dim4, stride2, stride3, indent + 1); + if (i != end1 - 1) + std::cout << "\n"; + } + std::cout << "]\n"; +} + +template +static void print_tensor3(T* in, int dim1, int dim2, int dim3, int stride1, int stride2, int size, int start) +{ + T* out = (T*)malloc(sizeof(T) * size); + cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); + _print_tensor3(&out[start], dim1, dim2, dim3, stride1, stride2, 1); + + /* + if (stride2 != dim3) { + for (int i = dim1 * dim2 * 3 * dim3 - 1 * dim3 - 8; i < dim1 * dim2 * 3 * dim3 - 1 * dim3; ++i) { + std::cout << out[i] << " "; + } + std::cout << "\n"; + } + */ + free(out); +} + +template +static void print_tensor3(T* in, int dim1, int dim2, int dim3) +{ + print_tensor3(in, dim1, dim2, dim3, dim2 * dim3, dim3, dim1 * dim2 * dim3, 0); +} + +template +static void print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4) +{ + print_tensor4(in, dim1, dim2, dim3, dim4, dim2 * dim3 * dim4, dim3 * dim4, dim4, dim1 * dim2 * dim3 * dim4); +} + +template +static void +print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4, int stride1, int stride2, int stride3, int size, int start) +{ + T* out = (T*)malloc(sizeof(T) * size); + cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); + _print_tensor4(&out[start], dim1, dim2, dim3, dim4, stride1, stride2, stride3, 1); + for (int i = dim1 * dim2 * dim3 * dim4 - 8; i < dim1 * dim2 * dim3 * dim4; ++i) { + std::cout << out[i] << " "; + } + std::cout << "\n"; + free(out); +} + +} // namespace fastertransformer From 
6e099590f0e670a1eaa4272789702982ce02586c Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 24 Sep 2023 00:41:15 +0000 Subject: [PATCH 28/55] dmpdmp --- .../kernels/unfused_attention_kernels.cu | 121 ++++++++++++++++-- .../kernels/unfused_attention_kernels.h | 13 ++ .../LLaMAContextAttentionLayer.cc | 54 ++++---- src/fastertransformer/utils/llama_utils.h | 12 +- 4 files changed, 156 insertions(+), 44 deletions(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 61d2a54ff..3e729b7e0 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -278,14 +278,14 @@ __global__ void softmax_kernel(T* attn_score, // Loop along with Q dimension. for (int64_t qi = blockIdx.x; qi < q_length; qi += gridDim.x) { - float data[ITEMS_PER_THREAD]; - int64_t qk_offset; - float local_max = -1e20f; + float data[ITEMS_PER_THREAD]; + int64_t qk_offset; + float local_max = -1e20f; // Loop along with K dimension. for (int64_t i = 0; blockDim.x * i + threadIdx.x < k_length; i++) { - int64_t ki = blockDim.x * i + threadIdx.x; // Index of K dimension. - qk_offset = ((bi * head_num + hi) * q_length + qi) * k_length + ki; + int64_t ki = blockDim.x * i + threadIdx.x; // Index of K dimension. + qk_offset = ((bi * head_num + hi) * q_length + qi) * k_length + ki; float qk_val = static_cast(qk[qk_offset]); float qk_bias = 0.0f; @@ -297,8 +297,8 @@ __global__ void softmax_kernel(T* attn_score, qk_bias += static_cast(linear_bias_slope * (ki - qi)); } - int64_t mask_offset = (bi * q_length + qi) * k_length + ki; - float mask_val = static_cast(ldg(&attn_mask[mask_offset])); + int64_t mask_offset = (bi * q_length + qi) * k_length + ki; + float mask_val = static_cast(ldg(&attn_mask[mask_offset])); qk_bias += (1.0f - mask_val) * -10000.0f; data[i] = qk_scale * qk_val + qk_bias; @@ -1363,8 +1363,8 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int head_idx = blockIdx.y; const int tidx = threadIdx.x; - const int total_seq_len = param.max_prefix_prompt_length + seq_len; - const bool is_masked = tidx * vec_size >= size_per_head; + const int total_seq_len = param.max_prefix_prompt_length + seq_len; + const bool is_masked = tidx * vec_size >= size_per_head; // NOTE: blockIdx.x < batch_size * param.max_prefix_prompt_length really handles prefix prompts if (PREFIX_PROMPT && token_idx < 0) { @@ -1581,6 +1581,109 @@ INSTANTIATEADDFUSEDQKVBIASTRANSPOSE(__nv_bfloat16); #endif #undef INSTANTIATEADDFUSEDQKVBIASTRANSPOSE +template +__global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head) +{ + // QKV: [token_num, 3, n] + // qkv_bias: [3, n] + // q_buf, k_buf, v_buf: [batch, head_num, seq_len, size_per_head] + + T* qkv_ptr[3] = {q_buf, k_buf, v_buf}; + const int n = head_num * size_per_head; + for (int index = blockDim.x * blockIdx.x + threadIdx.x; index < token_num * 3 * n; + index += gridDim.x * blockDim.x) { + + const int token_idx = index / (3 * n); + const int token_padded_idx = token_idx + (padding_offset == nullptr ? 
0 : padding_offset[token_idx]); + const int target_batch_id = token_padded_idx / seq_len; + const int seq_id = token_padded_idx % seq_len; + + const int qkv_id = (index % (3 * n)) / n; + const int head_id = (index % n) / size_per_head; + const int size_id = index % size_per_head; + + T val = ldg(&QKV[index]); + QKV[index] = val; + qkv_ptr[qkv_id][target_batch_id * head_num * seq_len * size_per_head + head_id * seq_len * size_per_head + + seq_id * size_per_head + size_id] = val; + } +} + +template +void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + cudaStream_t stream) +{ + const int m = token_num; + const int n = head_num * size_per_head; + dim3 block(384); + dim3 grid((int)(ceil(1.0 * m * n / 384))); + llama_add_fusedQKV_bias_transpose_kernel<<>>(q_buf, + k_buf, + v_buf, + QKV, + padding_offset, + batch_size, + seq_len, + token_num, + head_num, + size_per_head); +} + +template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, + float* k_buf, + float* v_buf, + float* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + cudaStream_t stream); + +template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, + half* k_buf, + half* v_buf, + half* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, + __nv_bfloat16* k_buf, + __nv_bfloat16* v_buf, + __nv_bfloat16* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + cudaStream_t stream); +#endif + template __global__ void transpose_4d(T* dst, T* src, diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index 7ac7604d4..5f8cd0669 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -113,6 +113,19 @@ struct PrefixPromptBatchWeightsParam { const size_t prefix_prompt_layer_offset_per_seq = 0; }; +template +void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + cudaStream_t stream); + template void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 151a1012c..dbf447707 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -80,34 +80,30 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, 3 * hidden_units_ /* n */); sync_check_cuda_error(); - if (true) { - print_tensor3(qkv_buf_, - batch_size, - seq_len, - hidden_units_, - seq_len * hidden_units_ * 3, - hidden_units_ * 3, - batch_size * seq_len * hidden_units_ * 3, - 2*hidden_units_); - } - if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are 
continuous cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); sync_check_cuda_error(); } - invokeAddFusedQKVBiasTranspose(q_buf_2_, - k_buf_2_, - v_buf_2_, - qkv_buf_, - attention_weights->query_weight.bias, - padding_offset, - batch_size, - seq_len, - m, - head_num_, - size_per_head_, - stream_); + invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, + k_buf_2_, + v_buf_2_, + qkv_buf_, + nullptr, // padding_offset, + batch_size, + seq_len, + m, + head_num_, + size_per_head_, + stream_); + if (true) { + std::cout << "batch_size: " << batch_size << "\n"; + std::cout << "head_num_: " << head_num_ << "\n"; + std::cout << "seq_len: " << seq_len << "\n"; + std::cout << "size_per_head_: " << size_per_head_ << "\n"; + print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); + } + /* invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, @@ -408,26 +404,26 @@ template void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, false); + qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, true); + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, true); k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; v_buf_2_ = k_buf_2_ + batch_size * seq_len * hidden_units_; // save memory usage when using fmha if (allocate_qk_buf) { - qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, false); + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, true); } else { allocator_->free((void**)(&qk_buf_)); qk_buf_ = nullptr; } - qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, false); - qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, false); + qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, true); + qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, true); if (is_qk_buf_float_ == true) { if (allocate_qk_buf) { qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, false); + qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, true); } else { allocator_->free((void**)(&qk_buf_float_)); diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h index c1c2632c7..8f6dcf5ff 100644 --- a/src/fastertransformer/utils/llama_utils.h +++ b/src/fastertransformer/utils/llama_utils.h @@ -144,12 +144,6 @@ static void print_tensor3(T* in, int dim1, int dim2, int dim3) print_tensor3(in, dim1, dim2, dim3, dim2 * dim3, dim3, dim1 * dim2 * dim3, 0); } -template -static void print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4) -{ - print_tensor4(in, dim1, dim2, dim3, dim4, dim2 * dim3 * dim4, dim3 * dim4, dim4, dim1 * dim2 * dim3 * dim4); -} - template static void print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4, int stride1, int stride2, int stride3, int size, int start) @@ -164,4 +158,10 @@ print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4, int 
stride1, int st free(out); } +template +static void print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4) +{ + print_tensor4(in, dim1, dim2, dim3, dim4, dim2 * dim3 * dim4, dim3 * dim4, dim4, dim1 * dim2 * dim3 * dim4, 0); +} + } // namespace fastertransformer From cf8087a4d03be32932b606f13c17f2c3658c122b Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 24 Sep 2023 00:41:58 +0000 Subject: [PATCH 29/55] dp --- .../layers/attention_layers/LLaMAContextAttentionLayer.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index dbf447707..b00c8b991 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -97,10 +97,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten size_per_head_, stream_); if (true) { - std::cout << "batch_size: " << batch_size << "\n"; - std::cout << "head_num_: " << head_num_ << "\n"; - std::cout << "seq_len: " << seq_len << "\n"; - std::cout << "size_per_head_: " << size_per_head_ << "\n"; print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); } From 13478f430048337d875aec015a4eb81189e80730 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 25 Sep 2023 09:30:35 +0000 Subject: [PATCH 30/55] dmp --- .../kernels/unfused_attention_kernels.cu | 78 +++++++++------- .../kernels/unfused_attention_kernels.h | 1 + .../LLaMAContextAttentionLayer.cc | 93 +++++++++++++------ .../models/llama/LLaMAContextDecoder.cc | 37 ++++---- src/fastertransformer/utils/llama_utils.h | 23 ++--- 5 files changed, 136 insertions(+), 96 deletions(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 3e729b7e0..97df58261 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1589,32 +1589,50 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, const int* padding_offset, const int batch_size, const int seq_len, - const int token_num, const int head_num, - const int size_per_head) + const int size_per_head, + const int rotary_embedding_dim) { - // QKV: [token_num, 3, n] - // qkv_bias: [3, n] - // q_buf, k_buf, v_buf: [batch, head_num, seq_len, size_per_head] + constexpr int vec_size = Vec_t::size; + using Vec_t = typename Vec_t::Type; + const int token_idx = blockIdx.x; + const int token_padding_offset = (padding_offset == nullptr || token_idx < 0) ? 0 : padding_offset[token_idx]; + const int tgt_token_idx = token_idx + token_padding_offset; - T* qkv_ptr[3] = {q_buf, k_buf, v_buf}; + const int batch_idx = tgt_token_idx / seq_len; + const int seq_idx = tgt_token_idx % seq_len; + + const int head_idx = blockIdx.y; + const int tidx = threadIdx.x; + + const bool is_masked = tidx * vec_size >= size_per_head; + + const int hidden_idx = head_idx * size_per_head + tidx * vec_size; const int n = head_num * size_per_head; - for (int index = blockDim.x * blockIdx.x + threadIdx.x; index < token_num * 3 * n; - index += gridDim.x * blockDim.x) { - const int token_idx = index / (3 * n); - const int token_padded_idx = token_idx + (padding_offset == nullptr ? 
0 : padding_offset[token_idx]); - const int target_batch_id = token_padded_idx / seq_len; - const int seq_id = token_padded_idx % seq_len; + const int src_q_idx = token_idx * 3 * n + hidden_idx; + const int src_k_idx = token_idx * 3 * n + hidden_idx + n; + const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; - const int qkv_id = (index % (3 * n)) / n; - const int head_id = (index % n) / size_per_head; - const int size_id = index % size_per_head; + Vec_t q, k, v; + if (!is_masked) { + q = *reinterpret_cast(&QKV[src_q_idx]); + k = *reinterpret_cast(&QKV[src_k_idx]); + v = *reinterpret_cast(&QKV[src_v_idx]); + } - T val = ldg(&QKV[index]); - QKV[index] = val; - qkv_ptr[qkv_id][target_batch_id * head_num * seq_len * size_per_head + head_id * seq_len * size_per_head - + seq_id * size_per_head + size_id] = val; + mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, seq_idx); + + const int dest_q_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + + seq_idx * size_per_head + tidx * vec_size; + + const int dest_kv_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + + seq_idx * size_per_head + tidx * vec_size; + + if (!is_masked) { + *reinterpret_cast(&q_buf[dest_q_idx]) = q; + *reinterpret_cast(&k_buf[dest_kv_idx]) = k; + *reinterpret_cast(&v_buf[dest_kv_idx]) = v; } } @@ -1629,22 +1647,13 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int token_num, const int head_num, const int size_per_head, + const int rotary_embedding_dim, cudaStream_t stream) { - const int m = token_num; - const int n = head_num * size_per_head; - dim3 block(384); - dim3 grid((int)(ceil(1.0 * m * n / 384))); - llama_add_fusedQKV_bias_transpose_kernel<<>>(q_buf, - k_buf, - v_buf, - QKV, - padding_offset, - batch_size, - seq_len, - token_num, - head_num, - size_per_head); + dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); + dim3 grid(token_num, head_num); + llama_add_fusedQKV_bias_transpose_kernel<<>>( + q_buf, k_buf, v_buf, QKV, padding_offset, batch_size, seq_len, head_num, size_per_head, rotary_embedding_dim); } template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, @@ -1657,6 +1666,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, const int token_num, const int head_num, const int size_per_head, + const int rotary_embedding_dim, cudaStream_t stream); template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, @@ -1669,6 +1679,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, const int token_num, const int head_num, const int size_per_head, + const int rotary_embedding_dim, cudaStream_t stream); #ifdef ENABLE_BF16 template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, @@ -1681,6 +1692,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, const int token_num, const int head_num, const int size_per_head, + const int rotary_embedding_dim, cudaStream_t stream); #endif diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index 5f8cd0669..0ccf64d8c 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -124,6 +124,7 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int token_num, const int head_num, const int size_per_head, + const int rotary_embedding_dim, cudaStream_t stream); template diff --git 
a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index b00c8b991..5aecc5de6 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -80,50 +80,83 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, 3 * hidden_units_ /* n */); sync_check_cuda_error(); +// if (true) { +// print_tensor3(qkv_buf_, +// batch_size, +// seq_len, +// hidden_units_, +// seq_len * 3 * hidden_units_, +// 3 * hidden_units_, +// batch_size * seq_len * 3 * hidden_units_, +// 0); +// print_tensor3(qkv_buf_, +// batch_size, +// seq_len, +// hidden_units_, +// seq_len * 3 * hidden_units_, +// 3 * hidden_units_, +// batch_size * seq_len * 3 * hidden_units_, +// hidden_units_); +// print_tensor3(qkv_buf_, +// batch_size, +// seq_len, +// hidden_units_, +// seq_len * 3 * hidden_units_, +// 3 * hidden_units_, +// batch_size * seq_len * 3 * hidden_units_, +// 2*hidden_units_); +// } +// if (true) { +// print_tensor4(qkv_buf_, +// batch_size, seq_len, head_num_, size_per_head_, +// seq_len * 3 * head_num_ * size_per_head_, +// 3 * head_num_ * size_per_head_, +// size_per_head_, +// batch_size * seq_len * 3 * head_num_ * size_per_head_, +// 0 +// ); +// print_tensor4(qkv_buf_, +// batch_size, seq_len, head_num_, size_per_head_, +// seq_len * 3 * head_num_ * size_per_head_, +// 3 * head_num_ * size_per_head_, +// size_per_head_, +// batch_size * seq_len * 3 * head_num_ * size_per_head_, +// head_num_ * size_per_head_ +// ); +// print_tensor4(qkv_buf_, +// batch_size, seq_len, head_num_, size_per_head_, +// seq_len * 3 * head_num_ * size_per_head_, +// 3 * head_num_ * size_per_head_, +// size_per_head_, +// batch_size * seq_len * 3 * head_num_ * size_per_head_, +// 2 * head_num_ * size_per_head_ +// ); +// } + if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); + cudaMemsetAsync(k_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); sync_check_cuda_error(); } - invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, + invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, v_buf_2_, qkv_buf_, - nullptr, // padding_offset, + padding_offset, batch_size, seq_len, m, head_num_, size_per_head_, + rotary_embedding_dim_, stream_); - if (true) { - print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); - } - - /* - invokeAddFusedQKVBiasTranspose(q_buf_2_, - k_buf_2_, - v_buf_2_, - PrefixPromptBatchWeightsParam{}, - qkv_buf_, - attention_weights->query_weight.bias, - padding_offset, - batch_size, - seq_len, - m, - head_num_, - size_per_head_, - //rotary_embedding_dim_, - 0, - false, - attention_weights->query_weight.scale_out, - 0, // int8_mode - stream_); - */ sync_check_cuda_error(); - // if (true) { - // print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); - // } +// if (true) { +// print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); +// print_tensor4(k_buf_2_, batch_size, head_num_, seq_len, size_per_head_); +// print_tensor4(v_buf_2_, batch_size, head_num_, seq_len, size_per_head_); +// } + // const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length // // Use batch major diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc 
b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 275c61ad3..9ae318554 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -256,7 +256,8 @@ void LLaMAContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { - invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); + invokeRemovePadding( + decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); sync_check_cuda_error(); } @@ -287,10 +288,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* stream_); sync_check_cuda_error(); - if (true) { - std::cout << l << "==================" << "ATTN_NORM\n"; + if (false) { + std::cout << l << "==================" + << "ATTN_NORM\n"; print_tensor3(decoder_normed_input_, batch_size, seq_len, hidden_units_); - std::cout << l << "==================" << "ATTN_NORM\n"; + std::cout << l << "==================" + << "ATTN_NORM\n"; std::cout << std::flush; } @@ -322,20 +325,21 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; - std::cout << l << "==================" << "QBUF\n"; + // std::cout << l << "==================" << "QBUF\n"; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); - std::cout << l << "==================" << "QBUF\n"; - std::cout << std::flush; - -// if (true) { -// std::cout << l << "==================" << "ATTENTION\n"; -// print_tensor3(self_attn_output_, batch_size, seq_len, hidden_units_); -// std::cout << l << "==================" << "ATTENTION\n"; -// std::cout << std::flush; -// } - + // std::cout << l << "==================" << "QBUF\n"; + // std::cout << std::flush; + + if (false) { + std::cout << l << "==================" + << "ATTENTION\n"; + print_tensor3(self_attn_output_, batch_size, seq_len, hidden_units_); + std::cout << l << "==================" + << "ATTENTION\n"; + std::cout << std::flush; + } invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, @@ -352,7 +356,8 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); TensorMap ffn_input_tensors( - {{"ffn_input", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); + {{"ffn_input", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); TensorMap ffn_output_tensors( {{"ffn_output", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h index 8f6dcf5ff..a840c4749 100644 --- a/src/fastertransformer/utils/llama_utils.h +++ b/src/fastertransformer/utils/llama_utils.h @@ -84,9 +84,9 @@ static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int stride1, in std::cout << ind; _print_tensor2(&out[i * stride1], dim2, dim3, stride2, indent + 1); if (i != end1 - 1) - std::cout << "\n"; + std::cout << "\n\n"; } - std::cout << "]\n"; + std::cout << "]"; } template @@ -105,7 +105,7 @@ _print_tensor4(T* out, int dim1, int dim2, int 
dim3, int dim4, int stride1, int std::cout << ind; _print_tensor3(&out[i * stride1], dim2, dim3, dim4, stride2, stride3, indent + 1); if (i != dim1 - 1) - std::cout << "\n\n"; + std::cout << "\n\n\n"; } if (start1 != end1) { std::cout << ind; @@ -115,9 +115,9 @@ _print_tensor4(T* out, int dim1, int dim2, int dim3, int dim4, int stride1, int std::cout << ind; _print_tensor3(&out[i * stride1], dim2, dim3, dim4, stride2, stride3, indent + 1); if (i != end1 - 1) - std::cout << "\n"; + std::cout << "\n\n\n"; } - std::cout << "]\n"; + std::cout << "]"; } template @@ -126,15 +126,7 @@ static void print_tensor3(T* in, int dim1, int dim2, int dim3, int stride1, int T* out = (T*)malloc(sizeof(T) * size); cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); _print_tensor3(&out[start], dim1, dim2, dim3, stride1, stride2, 1); - - /* - if (stride2 != dim3) { - for (int i = dim1 * dim2 * 3 * dim3 - 1 * dim3 - 8; i < dim1 * dim2 * 3 * dim3 - 1 * dim3; ++i) { - std::cout << out[i] << " "; - } - std::cout << "\n"; - } - */ + std::cout << "\n"; free(out); } @@ -151,9 +143,6 @@ print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4, int stride1, int st T* out = (T*)malloc(sizeof(T) * size); cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); _print_tensor4(&out[start], dim1, dim2, dim3, dim4, stride1, stride2, stride3, 1); - for (int i = dim1 * dim2 * dim3 * dim4 - 8; i < dim1 * dim2 * dim3 * dim4; ++i) { - std::cout << out[i] << " "; - } std::cout << "\n"; free(out); } From df743e0c8341eedf9876ebf67c4c5761dcabb3c4 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 25 Sep 2023 12:46:11 +0000 Subject: [PATCH 31/55] no cache version --- .../LLaMAContextAttentionLayer.cc | 161 +++++++++++------- src/fastertransformer/models/llama/LLaMA.cc | 16 +- .../models/llama/LLaMAContextDecoder.cc | 8 +- src/fastertransformer/th_op/llama/LLaMA.h | 11 +- src/fastertransformer/utils/llama_utils.h | 10 ++ src/fastertransformer/utils/memory_utils.cu | 2 +- 6 files changed, 121 insertions(+), 87 deletions(-) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 5aecc5de6..111f09740 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -68,6 +68,12 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten PUSH_RANGE("qkv_gemm"); + //std::cout << "G1====================================\n"; + //std::cout << "hidden_units_: " << hidden_units_ << "\n"; + //std::cout << "m: " << m << "\n"; + //std::cout << "G1====================================\n"; + //std::cout << std::flush; + cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, 3 * hidden_units_, // n @@ -80,62 +86,62 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, 3 * hidden_units_ /* n */); sync_check_cuda_error(); -// if (true) { -// print_tensor3(qkv_buf_, -// batch_size, -// seq_len, -// hidden_units_, -// seq_len * 3 * hidden_units_, -// 3 * hidden_units_, -// batch_size * seq_len * 3 * hidden_units_, -// 0); -// print_tensor3(qkv_buf_, -// batch_size, -// seq_len, -// hidden_units_, -// seq_len * 3 * hidden_units_, -// 3 * hidden_units_, -// batch_size * seq_len * 3 * hidden_units_, -// hidden_units_); -// print_tensor3(qkv_buf_, -// batch_size, -// seq_len, -// hidden_units_, -// seq_len * 3 * hidden_units_, -// 3 * hidden_units_, -// batch_size * seq_len * 3 * hidden_units_, -// 
2*hidden_units_); -// } -// if (true) { -// print_tensor4(qkv_buf_, -// batch_size, seq_len, head_num_, size_per_head_, -// seq_len * 3 * head_num_ * size_per_head_, -// 3 * head_num_ * size_per_head_, -// size_per_head_, -// batch_size * seq_len * 3 * head_num_ * size_per_head_, -// 0 -// ); -// print_tensor4(qkv_buf_, -// batch_size, seq_len, head_num_, size_per_head_, -// seq_len * 3 * head_num_ * size_per_head_, -// 3 * head_num_ * size_per_head_, -// size_per_head_, -// batch_size * seq_len * 3 * head_num_ * size_per_head_, -// head_num_ * size_per_head_ -// ); -// print_tensor4(qkv_buf_, -// batch_size, seq_len, head_num_, size_per_head_, -// seq_len * 3 * head_num_ * size_per_head_, -// 3 * head_num_ * size_per_head_, -// size_per_head_, -// batch_size * seq_len * 3 * head_num_ * size_per_head_, -// 2 * head_num_ * size_per_head_ -// ); -// } + // if (true) { + // print_tensor3(qkv_buf_, + // batch_size, + // seq_len, + // hidden_units_, + // seq_len * 3 * hidden_units_, + // 3 * hidden_units_, + // batch_size * seq_len * 3 * hidden_units_, + // 0); + // print_tensor3(qkv_buf_, + // batch_size, + // seq_len, + // hidden_units_, + // seq_len * 3 * hidden_units_, + // 3 * hidden_units_, + // batch_size * seq_len * 3 * hidden_units_, + // hidden_units_); + // print_tensor3(qkv_buf_, + // batch_size, + // seq_len, + // hidden_units_, + // seq_len * 3 * hidden_units_, + // 3 * hidden_units_, + // batch_size * seq_len * 3 * hidden_units_, + // 2*hidden_units_); + // } + // if (true) { + // print_tensor4(qkv_buf_, + // batch_size, seq_len, head_num_, size_per_head_, + // seq_len * 3 * head_num_ * size_per_head_, + // 3 * head_num_ * size_per_head_, + // size_per_head_, + // batch_size * seq_len * 3 * head_num_ * size_per_head_, + // 0 + // ); + // print_tensor4(qkv_buf_, + // batch_size, seq_len, head_num_, size_per_head_, + // seq_len * 3 * head_num_ * size_per_head_, + // 3 * head_num_ * size_per_head_, + // size_per_head_, + // batch_size * seq_len * 3 * head_num_ * size_per_head_, + // head_num_ * size_per_head_ + // ); + // print_tensor4(qkv_buf_, + // batch_size, seq_len, head_num_, size_per_head_, + // seq_len * 3 * head_num_ * size_per_head_, + // 3 * head_num_ * size_per_head_, + // size_per_head_, + // batch_size * seq_len * 3 * head_num_ * size_per_head_, + // 2 * head_num_ * size_per_head_ + // ); + // } if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync(k_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); + cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); sync_check_cuda_error(); } invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, @@ -151,12 +157,11 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten rotary_embedding_dim_, stream_); sync_check_cuda_error(); -// if (true) { -// print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); -// print_tensor4(k_buf_2_, batch_size, head_num_, seq_len, size_per_head_); -// print_tensor4(v_buf_2_, batch_size, head_num_, seq_len, size_per_head_); -// } - + // if (true) { + // print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); + // print_tensor4(k_buf_2_, batch_size, head_num_, seq_len, size_per_head_); + // print_tensor4(v_buf_2_, batch_size, head_num_, seq_len, size_per_head_); + // } // const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length // // Use batch major @@ -194,6 +199,15 @@ void 
LLaMAContextAttentionLayer::forward(TensorMap* output_ten // if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { PUSH_RANGE("Q*K batch gemm"); + //std::cout << "G2====================================\n"; + //std::cout << "attention_seq_len_1: " << attention_seq_len_1 << "\n"; + //std::cout << "attention_seq_len_2: " << attention_seq_len_2 << "\n"; + //std::cout << "batch_size: " << batch_size << "\n"; + //std::cout << "head_num_: " << head_num_ << "\n"; + //std::cout << "size_per_head_: " << size_per_head_ << "\n"; + //std::cout << "G2====================================\n"; + //std::cout << std::flush; + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, CUBLAS_OP_N, attention_seq_len_2, // n @@ -215,7 +229,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten attention_seq_len_2 * attention_seq_len_1, batch_size * head_num_, // global batch size CUDA_R_32F); - sync_check_cuda_error(); POP_RANGE; @@ -236,6 +249,14 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } else { PUSH_RANGE("Q*K batch gemm"); + //std::cout << "G2====================================\n"; + //std::cout << "attention_seq_len_1: " << attention_seq_len_1 << "\n"; + //std::cout << "attention_seq_len_2: " << attention_seq_len_2 << "\n"; + //std::cout << "batch_size: " << batch_size << "\n"; + //std::cout << "head_num_: " << head_num_ << "\n"; + //std::cout << "size_per_head_: " << size_per_head_ << "\n"; + //std::cout << "G2====================================\n"; + //std::cout << std::flush; cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, CUBLAS_OP_N, attention_seq_len_2, @@ -270,6 +291,14 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } PUSH_RANGE("QK*V batch gemm"); + //std::cout << "G3====================================\n"; + //std::cout << "attention_seq_len_1: " << attention_seq_len_1 << "\n"; + //std::cout << "attention_seq_len_2: " << attention_seq_len_2 << "\n"; + //std::cout << "batch_size: " << batch_size << "\n"; + //std::cout << "head_num_: " << head_num_ << "\n"; + //std::cout << "size_per_head_: " << size_per_head_ << "\n"; + //std::cout << "G3====================================\n"; + //std::cout << std::flush; cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, CUBLAS_OP_N, size_per_head_, @@ -433,26 +462,26 @@ template void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, true); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, true); + qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, false); k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; v_buf_2_ = k_buf_2_ + batch_size * seq_len * hidden_units_; // save memory usage when using fmha if (allocate_qk_buf) { - qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, true); + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, false); } else { allocator_->free((void**)(&qk_buf_)); qk_buf_ = nullptr; } - qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, true); - qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * 
hidden_units_, true); + qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, false); + qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, false); if (is_qk_buf_float_ == true) { if (allocate_qk_buf) { qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, true); + qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, false); } else { allocator_->free((void**)(&qk_buf_float_)); diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index cdf8071e6..988809cfa 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -58,10 +58,8 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ input_attention_mask_ = (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); - decoder_output_buf_ = - (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); normed_decoder_output_buf_ = - (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); @@ -84,7 +82,6 @@ void LLaMA::freeBuffer() { if (is_allocate_buffer_) { allocator_->free((void**)(&input_attention_mask_)); - allocator_->free((void**)(&decoder_output_buf_)); allocator_->free((void**)(&logits_buf_)); allocator_->free((void**)(&key_cache_)); @@ -205,7 +202,8 @@ void LLaMA::forward(std::unordered_map* output_ten const std::unordered_map* input_tensors, const LLaMAWeight* llama_weights) { - // Logger::getLogger().setLevel(Logger::Level::TRACE); + // Logger::getLogger().setLevel(Logger::Level::DEBUG); + // // input_tensors: // input_ids [batch_size, seq_len] // input_lengths [batch_size] @@ -265,10 +263,6 @@ void LLaMA::forward(std::unordered_map* output_ten hidden_units_, stream_); sync_check_cuda_error(); - -// std::cout << 0 << "==================" << "EMBEDDING\n"; -// print_tensor3(context_decoder_input_buf_, batch_size, seq_len, hidden_units_); -// std::cout << 0 << "==================" << "EMBEDDING\n"; } std::unordered_map decoder_input_tensors{ @@ -312,10 +306,12 @@ void LLaMA::forward(std::unordered_map* output_ten vocab_size_); sync_check_cuda_error(); + if (std::is_same::value) { float* output_logits = output_tensors->at("output_logits").getPtr(); - invokeCudaCast(output_logits, logits_buf_, batch_size * seq_len * vocab_size_, stream_); + invokeCudaCast(output_logits, logits_buf_, batch_size * seq_len * vocab_size_, stream_); sync_check_cuda_error(); + //print_tensor3(output_logits, batch_size, seq_len, vocab_size_); } } } diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 9ae318554..41876cc4e 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -67,7 +67,7 @@ void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, 
false)); decoder_layer_output_ = reinterpret_cast( allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); - h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); + h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), false, false); padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); @@ -325,12 +325,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; - // std::cout << l << "==================" << "QBUF\n"; + //std::cout << l << "==================" << "ATTENTION\n"; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); - // std::cout << l << "==================" << "QBUF\n"; - // std::cout << std::flush; + //std::cout << l << "==================" << "ATTENTION\n"; + //std::cout << std::flush; if (false) { std::cout << l << "==================" diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index ab594c5c7..9a7cb9168 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -114,16 +114,15 @@ class FTLLaMA: public IFLLaMA { delete cublas_wrapper_mutex_; } - virtual void forward(th::Tensor& output_logits, - th::Tensor& input_ids, - th::Tensor& input_lengths, - const int start_pos) override + virtual void + forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int start_pos) override { auto stream = at::cuda::getCurrentCUDAStream().stream(); cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); cublasSetStream(cublasHandle, stream); - ft::Allocator allocator = ft::Allocator(); - ft::cublasMMWrapper cublas_wrapper = ft::cublasMMWrapper( + ft::Allocator allocator = + ft::Allocator(at::cuda::getCurrentCUDAStream().device_index()); + ft::cublasMMWrapper cublas_wrapper = ft::cublasMMWrapper( cublasHandle, cublasltHandle_, stream, cublas_algo_map_, cublas_wrapper_mutex_, &allocator); if (std::is_same::value) { diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h index a840c4749..d23f5ba8e 100644 --- a/src/fastertransformer/utils/llama_utils.h +++ b/src/fastertransformer/utils/llama_utils.h @@ -120,6 +120,16 @@ _print_tensor4(T* out, int dim1, int dim2, int dim3, int dim4, int stride1, int std::cout << "]"; } +template +static void print_tensor1(T* in, int dim1) +{ + T* out = (T*)malloc(sizeof(T) * dim1); + cudaMemcpy(out, in, sizeof(T) * dim1, cudaMemcpyDeviceToHost); + _print_tensor1(out, dim1, 1); + std::cout << "\n"; + free(out); +} + template static void print_tensor3(T* in, int dim1, int dim2, int dim3, int stride1, int stride2, int size, int start) { diff --git a/src/fastertransformer/utils/memory_utils.cu b/src/fastertransformer/utils/memory_utils.cu index d795cbf99..134224a09 100644 --- a/src/fastertransformer/utils/memory_utils.cu +++ b/src/fastertransformer/utils/memory_utils.cu @@ -177,7 +177,7 @@ __global__ void cudaCast(T_OUT* dst, T_IN* src, const size_t size) 
template void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream) { - cudaCast<<<(size + 255) / 256, 256, 0, stream>>>(dst, src, size); + cudaCast<<<256, 256, 0, stream>>>(dst, src, size); } template void invokeCudaCast(float* dst, half const* const src, const size_t size, cudaStream_t stream); From 220aec06594944fc787dfd170f9b4f50bfce1350 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 25 Sep 2023 12:51:11 +0000 Subject: [PATCH 32/55] no-cache version bug fix --- src/fastertransformer/models/llama/LLaMA.cc | 1 - .../models/llama/LLaMAContextDecoder.cc | 23 +------------------ 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 988809cfa..e8abe28f9 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -311,7 +311,6 @@ void LLaMA::forward(std::unordered_map* output_ten float* output_logits = output_tensors->at("output_logits").getPtr(); invokeCudaCast(output_logits, logits_buf_, batch_size * seq_len * vocab_size_, stream_); sync_check_cuda_error(); - //print_tensor3(output_logits, batch_size, seq_len, vocab_size_); } } } diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 41876cc4e..41abb0006 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -67,7 +67,7 @@ void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); decoder_layer_output_ = reinterpret_cast( allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); - h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), false, false); + h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); @@ -288,15 +288,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* stream_); sync_check_cuda_error(); - if (false) { - std::cout << l << "==================" - << "ATTN_NORM\n"; - print_tensor3(decoder_normed_input_, batch_size, seq_len, hidden_units_); - std::cout << l << "==================" - << "ATTN_NORM\n"; - std::cout << std::flush; - } - TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, {"attention_mask", @@ -325,21 +316,9 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; - //std::cout << l << "==================" << "ATTENTION\n"; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); - //std::cout << l << "==================" << "ATTENTION\n"; - //std::cout << std::flush; - - if (false) { - std::cout << l << "==================" - << "ATTENTION\n"; - print_tensor3(self_attn_output_, 
batch_size, seq_len, hidden_units_); - std::cout << l << "==================" - << "ATTENTION\n"; - std::cout << std::flush; - } invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, From 4fb06e7ba2cac488f9f144acb87d3087d328e35d Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 25 Sep 2023 16:49:39 +0000 Subject: [PATCH 33/55] cache version --- src/fastertransformer/kernels/CMakeLists.txt | 4 + .../kernels/llama_kernels.cu | 56 +++++ src/fastertransformer/kernels/llama_kernels.h | 15 ++ .../kernels/unfused_attention_kernels.cu | 214 ++++++++++++++++-- .../kernels/unfused_attention_kernels.h | 26 +++ .../LLaMAContextAttentionLayer.cc | 166 ++++---------- .../LLaMAContextAttentionLayer.h | 2 +- .../models/llama/CMakeLists.txt | 3 +- src/fastertransformer/models/llama/LLaMA.cc | 33 +-- src/fastertransformer/models/llama/LLaMA.h | 21 +- .../models/llama/LLaMAContextDecoder.cc | 1 + src/fastertransformer/th_op/llama/LLaMA.h | 103 +++++---- src/fastertransformer/utils/llama_utils.h | 10 + 13 files changed, 437 insertions(+), 217 deletions(-) create mode 100644 src/fastertransformer/kernels/llama_kernels.cu create mode 100644 src/fastertransformer/kernels/llama_kernels.h diff --git a/src/fastertransformer/kernels/CMakeLists.txt b/src/fastertransformer/kernels/CMakeLists.txt index fd2a1b494..c5cc14c8e 100644 --- a/src/fastertransformer/kernels/CMakeLists.txt +++ b/src/fastertransformer/kernels/CMakeLists.txt @@ -233,3 +233,7 @@ add_library(moe_kernels STATIC moe_kernels.cu) set_property(TARGET moe_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET moe_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(moe_kernels PRIVATE moe_gemm_kernels) + +add_library(llama_kernels STATIC llama_kernels.cu) +set_property(TARGET llama_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET llama_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu new file mode 100644 index 000000000..3c753f866 --- /dev/null +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -0,0 +1,56 @@ +#include "src/fastertransformer/utils/cuda_fp8_utils.h" +#include "src/fastertransformer/kernels/llama_kernels.h" + +namespace fastertransformer { + +template +__global__ void LLaMAbuildDecoderAttentionMaskKernel( + T* attention_mask, const int* sequence_lengths, const int batch_size, const int seq_len, const int start_pos) +{ + // sequence_lengths: + // [batch_size] + // attention_mask: + // [batch_size, 1, seq_len, seq_len + start_pos] + const int max_length = seq_len + start_pos; + const int mask_size_per_seq = seq_len * max_length; + attention_mask += blockIdx.x * mask_size_per_seq; + const int seq_length = sequence_lengths[blockIdx.x]; + + for (int i = threadIdx.x; i < mask_size_per_seq; i += blockDim.x) { + int row_id = i / max_length; + int col_id = i % max_length; + if (row_id < seq_length && col_id <= (row_id + start_pos)) { + attention_mask[i] = (T)(1.0f); + } + else { + attention_mask[i] = (T)(0.0f); + } + } +} + +template +void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, + const int* sequence_lengths, + const int batch_size, + const int seq_len, + const int start_pos, + cudaStream_t stream) +{ + LLaMAbuildDecoderAttentionMaskKernel + <<>>(attention_mask, sequence_lengths, batch_size, seq_len, start_pos); +} + +template void invokeLLaMABuildDecoderAttentionMask(float* attention_mask, + const int* sequence_lengths, + const int batch_size, + const int 
seq_len, + const int start_pos, + cudaStream_t stream); + +template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, + const int* sequence_lengths, + const int batch_size, + const int seq_len, + const int start_pos, + cudaStream_t stream); +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h new file mode 100644 index 000000000..320b5624f --- /dev/null +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -0,0 +1,15 @@ +#pragma once + + +#include "src/fastertransformer/utils/cuda_fp8_utils.h" +#include "src/fastertransformer/utils/memory_utils.h" +namespace fastertransformer { + +template +void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, + const int* sequence_lengths, + const int batch_size, + const int seq_len, + const int start_pos, + cudaStream_t stream); +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 97df58261..1010ca3f3 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1591,7 +1591,8 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, const int seq_len, const int head_num, const int size_per_head, - const int rotary_embedding_dim) + const int rotary_embedding_dim, + const int start_pos) { constexpr int vec_size = Vec_t::size; using Vec_t = typename Vec_t::Type; @@ -1610,9 +1611,9 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, const int hidden_idx = head_idx * size_per_head + tidx * vec_size; const int n = head_num * size_per_head; - const int src_q_idx = token_idx * 3 * n + hidden_idx; - const int src_k_idx = token_idx * 3 * n + hidden_idx + n; - const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; + const int src_q_idx = token_idx * 3 * n + hidden_idx; + const int src_k_idx = token_idx * 3 * n + hidden_idx + n; + const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; Vec_t q, k, v; if (!is_masked) { @@ -1621,7 +1622,7 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, v = *reinterpret_cast(&QKV[src_v_idx]); } - mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, seq_idx); + mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, start_pos + seq_idx); const int dest_q_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + seq_idx * size_per_head + tidx * vec_size; @@ -1648,12 +1649,22 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, + const int start_pos, cudaStream_t stream) { - dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); - dim3 grid(token_num, head_num); - llama_add_fusedQKV_bias_transpose_kernel<<>>( - q_buf, k_buf, v_buf, QKV, padding_offset, batch_size, seq_len, head_num, size_per_head, rotary_embedding_dim); + dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); + dim3 grid(token_num, head_num); + llama_add_fusedQKV_bias_transpose_kernel<<>>(q_buf, + k_buf, + v_buf, + QKV, + padding_offset, + batch_size, + seq_len, + head_num, + size_per_head, + rotary_embedding_dim, + start_pos); } template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, @@ -1667,6 +1678,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, + const int 
start_pos, cudaStream_t stream); template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, @@ -1680,6 +1692,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, + const int start_pos, cudaStream_t stream); #ifdef ENABLE_BF16 template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, @@ -1693,6 +1706,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, + const int start_pos, cudaStream_t stream); #endif @@ -1875,6 +1889,7 @@ void invokeTranspose4dBatchMajor(T* k_dst, const int size_per_head, \ const int local_head_num, \ cudaStream_t stream) + INSTANTIATETRANSPOSE4DBATCHMAJOR(float); INSTANTIATETRANSPOSE4DBATCHMAJOR(half); #ifdef ENABLE_BF16 @@ -1882,6 +1897,169 @@ INSTANTIATETRANSPOSE4DBATCHMAJOR(__nv_bfloat16); #endif #undef INSTANTIATETRANSPOSE4DBATCHMAJOR +template +__global__ void transpose_4d_save_to_cache(T* k_dst, + const T* k_src, + T* v_dst, + const T* v_src, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len, + const int start_pos) +{ + // [batch_size, head_num, seq_len, size_per_head] + const int batch_id = blockIdx.y; + const int head_id = blockIdx.z; + + // 16 byte loads will handle "x" dimension + auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * seq_len + + head_id * size_per_head * seq_len); + auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len + + start_pos * size_per_head + ); + auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * seq_len + + head_id * size_per_head * seq_len); + auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len + + start_pos * size_per_head + ); + + // idx is over output dimension L * size_per_head / x for values + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + const int size_per_head_div_x = size_per_head / X_ELEMS; + + if (idx >= size_per_head_div_x * seq_len) { + return; + } + + key_dst[idx] = key_src[idx]; + val_dst[idx] = val_src[idx]; +} + +template +void invokeLLaMASaveToCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int local_batch_size, + const int seq_len, + const int max_seq_len, + const int size_per_head, + const int local_head_num, + const int start_pos, + cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 
4 : 8; + dim3 grid((seq_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); + + transpose_4d_save_to_cache<<>>( + k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, start_pos); +} + +#define INSTANTIATESAVETOCACHE(T) \ + template void invokeLLaMASaveToCache(T* k_dst, \ + T* v_dst, \ + const T* k_src, \ + const T* v_src, \ + const int local_batch_size, \ + const int seq_len, \ + const int max_seq_len, \ + const int size_per_head, \ + const int local_head_num, \ + const int start_pos, \ + cudaStream_t stream) +INSTANTIATESAVETOCACHE(float); +INSTANTIATESAVETOCACHE(half); +#ifdef ENABLE_BF16 +INSTANTIATESAVETOCACHE(__nv_bfloat16); +#endif +#undef INSTANTIATESAVETOCACHE + +template +__global__ void transpose_4d_load_from_cache(T* k_dst, + const T* k_src, + T* v_dst, + const T* v_src, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len, + const int start_pos) +{ + // [batch_size, head_num, start_pos+seq_len, size_per_head] + const int batch_id = blockIdx.y; + const int head_id = blockIdx.z; + const int real_seq_len = start_pos + seq_len; + + // 16 byte loads will handle "x" dimension + auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len); + auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * real_seq_len + + head_id * size_per_head * real_seq_len); + auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len); + auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * real_seq_len + + head_id * size_per_head * real_seq_len); + + // idx is over output dimension L * size_per_head / x for values + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + const int size_per_head_div_x = size_per_head / X_ELEMS; + + if (idx >= size_per_head_div_x * real_seq_len) { + return; + } + + key_dst[idx] = key_src[idx]; + val_dst[idx] = val_src[idx]; +} + +template +void invokeLLaMALoadFromCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int local_batch_size, + const int seq_len, + const int max_seq_len, + const int size_per_head, + const int local_head_num, + const int start_pos, + cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 
4 : 8; + dim3 grid(((start_pos + seq_len) * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); + + transpose_4d_load_from_cache<<>>( + k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, start_pos); +} + +#define INSTANTIATELOADFROMCACHE(T) \ + template void invokeLLaMALoadFromCache(T* k_dst, \ + T* v_dst, \ + const T* k_src, \ + const T* v_src, \ + const int local_batch_size, \ + const int seq_len, \ + const int max_seq_len, \ + const int size_per_head, \ + const int local_head_num, \ + const int start_pos, \ + cudaStream_t stream) +INSTANTIATELOADFROMCACHE(float); +INSTANTIATELOADFROMCACHE(half); +#ifdef ENABLE_BF16 +INSTANTIATELOADFROMCACHE(__nv_bfloat16); +#endif +#undef INSTANTIATELOADFROMCACHE + template __global__ void addRelativeAttentionBias( T* qk_buf, const T* relative_attention_bias, const int batch_size, const int head_num, const int seq_len) @@ -1942,8 +2120,8 @@ INSTANTIATEADDRELATIVEATTENTIONBIAS(__nv_bfloat16); // m = batch*window_num*window_len // mm_qkv is [m, head*3*size_per_head] row-major // bias_qkv is [head*3*size_per_head] -// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, size_per_head] row-major -// grid(window_len, window_num, 3*batch); +// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, +// size_per_head] row-major grid(window_len, window_num, 3*batch); // block(num_head * size_per_head) template __global__ void add_head3Size_QKV_bias(const T* mm_qkv, @@ -1993,8 +2171,8 @@ __global__ void add_head3Size_QKV_bias(const T* mm_qkv, // m = batch*window_num*window_len // mm_qkv is [m, head*3*size_per_head] row-major // bias_qkv is [head*3*size_per_head] -// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, size_per_head] row-major -// grid(window_len, window_num, 3*batch); +// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, +// size_per_head] row-major grid(window_len, window_num, 3*batch); // block(num_head * size_per_head) template<> __global__ void add_head3Size_QKV_bias(const float2* mm_qkv, @@ -2046,8 +2224,8 @@ __global__ void add_head3Size_QKV_bias(const float2* mm_qkv, // m = batch*window_num*window_len // mm_qkv is [m, head*3*size_per_head] row-major // bias_qkv is [head*3*size_per_head] -// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, size_per_head] row-major -// grid(window_len, window_num, batch); +// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, +// size_per_head] row-major grid(window_len, window_num, batch); // block(num_head * size_per_head) template<> __global__ void add_head3Size_QKV_bias(const half2* mm_qkv, @@ -2237,7 +2415,8 @@ INSTANTIATEADDHEAD3SIZEQKVBIAS(__nv_bfloat16); #endif #undef INSTANTIATEADDHEAD3SIZEQKVBIAS -/******************* invokeMaskedSoftMaxWithRelPosBias ***********************/ +/******************* invokeMaskedSoftMaxWithRelPosBias + * ***********************/ // grid = (window_len/word_per_thread, window_num*num_head, batch_size) // block.x = max(32, (window_len + 31)/32*32) @@ -2586,7 +2765,8 @@ __global__ void transpose_attentions( // attentions_in shape [B, H, S, S] // attentions_out shape [B, L, H, S, S]. // Note that we write the L dimension as if it was index 0. - // In reality, the pointer has already been shifted to point to the correct layer. + // In reality, the pointer has already been shifted to point to the + // correct layer. 
const auto batch_idx = blockIdx.x; const auto head_idx = blockIdx.y; diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index 0ccf64d8c..2d4b01dde 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -125,6 +125,7 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, + const int start_pos, cudaStream_t stream); template @@ -203,6 +204,31 @@ void invokeTranspose4dBatchMajor(T* k_dst, const int local_head_num, cudaStream_t stream); +template +void invokeLLaMASaveToCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int local_batch_size, + const int seq_len, + const int max_seq_len, + const int size_per_head, + const int local_head_num, + const int start_pos, + cudaStream_t stream); +template +void invokeLLaMALoadFromCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int local_batch_size, + const int seq_len, + const int max_seq_len, + const int size_per_head, + const int local_head_num, + const int start_pos, + cudaStream_t stream); + template void invokeAddRelativeAttentionBias(T* qk_buf, const T* relative_attention_bias, diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 111f09740..a9989543a 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -39,13 +39,14 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // output_tensors: // hidden_features [token_num, hidden_dimension] - // key_cache [batch, local_head_num, size_per_head // x, max_seq_len, x] + // key_cache [batch, local_head_num, max_seq_len, size_per_head] // value_cache [batch, local_head_num, max_seq_len, size_per_head] FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - FT_CHECK(output_tensors->at("key_cache").shape.size() == 5); + FT_CHECK(output_tensors->at("key_cache").shape.size() == 4); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); const int batch_size = input_tensors->at("attention_mask").shape[0]; const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int max_seq_len = (int)(output_tensors->at("key_cache").shape[2]); const int layer_id = input_tensors->getVal("layer_id"); const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); @@ -60,7 +61,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten "LLaMA Context FUSED_PADDED_MHA is not supported !"); PUSH_RANGE("attention buffer alloc"); - allocateBuffer(batch_size, seq_len, attention_type != AttentionType::FUSED_MHA); + allocateBuffer(batch_size, seq_len, max_seq_len, attention_type != AttentionType::FUSED_MHA); POP_RANGE; sync_check_cuda_error(); @@ -68,12 +69,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten PUSH_RANGE("qkv_gemm"); - //std::cout << "G1====================================\n"; - //std::cout << "hidden_units_: " << hidden_units_ << "\n"; - //std::cout << "m: " << m << "\n"; - //std::cout << "G1====================================\n"; - //std::cout << std::flush; - cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, 3 * hidden_units_, // n @@ -86,58 +81,6 @@ void 
LLaMAContextAttentionLayer::forward(TensorMap* output_ten qkv_buf_, 3 * hidden_units_ /* n */); sync_check_cuda_error(); - // if (true) { - // print_tensor3(qkv_buf_, - // batch_size, - // seq_len, - // hidden_units_, - // seq_len * 3 * hidden_units_, - // 3 * hidden_units_, - // batch_size * seq_len * 3 * hidden_units_, - // 0); - // print_tensor3(qkv_buf_, - // batch_size, - // seq_len, - // hidden_units_, - // seq_len * 3 * hidden_units_, - // 3 * hidden_units_, - // batch_size * seq_len * 3 * hidden_units_, - // hidden_units_); - // print_tensor3(qkv_buf_, - // batch_size, - // seq_len, - // hidden_units_, - // seq_len * 3 * hidden_units_, - // 3 * hidden_units_, - // batch_size * seq_len * 3 * hidden_units_, - // 2*hidden_units_); - // } - // if (true) { - // print_tensor4(qkv_buf_, - // batch_size, seq_len, head_num_, size_per_head_, - // seq_len * 3 * head_num_ * size_per_head_, - // 3 * head_num_ * size_per_head_, - // size_per_head_, - // batch_size * seq_len * 3 * head_num_ * size_per_head_, - // 0 - // ); - // print_tensor4(qkv_buf_, - // batch_size, seq_len, head_num_, size_per_head_, - // seq_len * 3 * head_num_ * size_per_head_, - // 3 * head_num_ * size_per_head_, - // size_per_head_, - // batch_size * seq_len * 3 * head_num_ * size_per_head_, - // head_num_ * size_per_head_ - // ); - // print_tensor4(qkv_buf_, - // batch_size, seq_len, head_num_, size_per_head_, - // seq_len * 3 * head_num_ * size_per_head_, - // 3 * head_num_ * size_per_head_, - // size_per_head_, - // batch_size * seq_len * 3 * head_num_ * size_per_head_, - // 2 * head_num_ * size_per_head_ - // ); - // } if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous @@ -155,34 +98,39 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten head_num_, size_per_head_, rotary_embedding_dim_, + start_pos, stream_); sync_check_cuda_error(); - // if (true) { - // print_tensor4(q_buf_2_, batch_size, head_num_, seq_len, size_per_head_); - // print_tensor4(k_buf_2_, batch_size, head_num_, seq_len, size_per_head_); - // print_tensor4(v_buf_2_, batch_size, head_num_, seq_len, size_per_head_); - // } - - // const int max_seq_len = (int)(output_tensors->at("key_cache").shape[3]); // max output seq length - // // Use batch major - // // put k/v_buf from shape [B, H, L, Dh] - // // to cache [B, H, Dh/x, L, x] and [B, H, L, Dh/x, x] - // // TODO: Cache implementation - // // k_cache: [batch_size, num_heads, L, Dh] - // // k_buf: [batch_size, num_heads, start_pos + seq_len, Dh] - // // v_buf: [batch_size, num_heads, L, Dh] - // invokeTranspose4dBatchMajor(output_tensors->getPtr("key_cache"), - // output_tensors->getPtr("value_cache"), - // k_buf_2_, - // v_buf_2_, - // batch_size, - // seq_len, - // max_seq_len, - // size_per_head_, - // head_num_, - // stream_); - // sync_check_cuda_error(); - // POP_RANGE; + + // key_cache [batch, local_head_num, max_seq_len, size_per_head] + // value_cache [batch, local_head_num, max_seq_len, size_per_head] + T* key_cache = output_tensors->getPtr("key_cache"); + T* value_cache = output_tensors->getPtr("value_cache"); + invokeLLaMASaveToCache(key_cache, + value_cache, + k_buf_2_, + v_buf_2_, + batch_size, + seq_len, + max_seq_len, + size_per_head_, + head_num_, + start_pos, + stream_); + sync_check_cuda_error(); + POP_RANGE; + + invokeLLaMALoadFromCache(k_buf_2_, + v_buf_2_, + key_cache, + value_cache, + batch_size, + seq_len, + max_seq_len, + size_per_head_, + head_num_, + start_pos, + stream_); if (attention_type == AttentionType::FUSED_MHA) { 
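        // Summary of the KV-cache handling above in this patch:
        //  - invokeLLaMAAddFusedQKVBiasTranspose wrote Q/K/V into q_buf_2_/k_buf_2_/v_buf_2_ in
        //    [batch, head_num_, seq_len, size_per_head_] layout and applied rotary embedding at
        //    absolute positions start_pos + seq_idx.
        //  - invokeLLaMASaveToCache appended this step's K/V into key_cache/value_cache
        //    ([batch, head_num_, max_seq_len, size_per_head_]), starting at row start_pos.
        //  - invokeLLaMALoadFromCache then reloaded the full history (start_pos + seq_len rows per
        //    head) back into k_buf_2_/v_buf_2_, which is why the unfused path uses
        //    attention_seq_len_2 = start_pos + seq_len as the KV length.
        //  - This FUSED_MHA branch instead relies on the fused causal-masked kernel set up below,
        //    skipping the explicit batched GEMM + softmax pipeline of the else branch.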
dispatcher_fp16->setup_causal_masked_fmha(seq_len, batch_size); @@ -190,8 +138,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } else { const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = seq_len; // q length - const int attention_seq_len_2 = seq_len; // kv length + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = start_pos + seq_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); // @@ -199,14 +147,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { PUSH_RANGE("Q*K batch gemm"); - //std::cout << "G2====================================\n"; - //std::cout << "attention_seq_len_1: " << attention_seq_len_1 << "\n"; - //std::cout << "attention_seq_len_2: " << attention_seq_len_2 << "\n"; - //std::cout << "batch_size: " << batch_size << "\n"; - //std::cout << "head_num_: " << head_num_ << "\n"; - //std::cout << "size_per_head_: " << size_per_head_ << "\n"; - //std::cout << "G2====================================\n"; - //std::cout << std::flush; cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, CUBLAS_OP_N, @@ -249,14 +189,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } else { PUSH_RANGE("Q*K batch gemm"); - //std::cout << "G2====================================\n"; - //std::cout << "attention_seq_len_1: " << attention_seq_len_1 << "\n"; - //std::cout << "attention_seq_len_2: " << attention_seq_len_2 << "\n"; - //std::cout << "batch_size: " << batch_size << "\n"; - //std::cout << "head_num_: " << head_num_ << "\n"; - //std::cout << "size_per_head_: " << size_per_head_ << "\n"; - //std::cout << "G2====================================\n"; - //std::cout << std::flush; cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, CUBLAS_OP_N, attention_seq_len_2, @@ -291,14 +223,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } PUSH_RANGE("QK*V batch gemm"); - //std::cout << "G3====================================\n"; - //std::cout << "attention_seq_len_1: " << attention_seq_len_1 << "\n"; - //std::cout << "attention_seq_len_2: " << attention_seq_len_2 << "\n"; - //std::cout << "batch_size: " << batch_size << "\n"; - //std::cout << "head_num_: " << head_num_ << "\n"; - //std::cout << "size_per_head_: " << size_per_head_ << "\n"; - //std::cout << "G3====================================\n"; - //std::cout << std::flush; cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, CUBLAS_OP_N, size_per_head_, @@ -459,17 +383,21 @@ void LLaMAContextAttentionLayer::allocateBuffer() } template -void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf) +void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, + size_t seq_len, + size_t max_seq_len, + bool allocate_qk_buf) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * hidden_units_, false); - k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; - v_buf_2_ = k_buf_2_ + batch_size * seq_len * hidden_units_; + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * max_seq_len * 3 * hidden_units_, false); + k_buf_2_ = q_buf_2_ + batch_size * max_seq_len * hidden_units_; + v_buf_2_ = k_buf_2_ + batch_size * max_seq_len * hidden_units_; // save memory usage when using fmha if 
(allocate_qk_buf) { - qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * seq_len, false); + qk_buf_ = + (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * max_seq_len * max_seq_len, false); } else { allocator_->free((void**)(&qk_buf_)); @@ -481,7 +409,7 @@ void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq if (is_qk_buf_float_ == true) { if (allocate_qk_buf) { qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * seq_len, false); + qk_buf_float_, sizeof(float) * batch_size * head_num_ * max_seq_len * max_seq_len, false); } else { allocator_->free((void**)(&qk_buf_float_)); diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index 85fd74af8..7300186ba 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -38,7 +38,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { std::unique_ptr dispatcher_fp16; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len, bool allocate_qk_buf); + void allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len, bool allocate_qk_buf); void freeBuffer() override; using BaseAttentionLayer::is_free_buffer_after_forward_; diff --git a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt index 0c5106f00..24acf1d78 100644 --- a/src/fastertransformer/models/llama/CMakeLists.txt +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -27,7 +27,7 @@ target_link_libraries(LLaMAContextDecoder PUBLIC -lcudart cublasMMWrapper FfnLayer layernorm_kernels add_residual_kernels - gpt_kernels + llama_kernels tensor nccl_utils cuda_utils @@ -45,6 +45,7 @@ target_link_libraries(LLaMA PUBLIC -lcudart LLaMAContextDecoder decoding_kernels gpt_kernels + llama_kernels BaseBeamSearchLayer bert_preprocess_kernels tensor diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index e8abe28f9..29caa3722 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -17,10 +17,10 @@ #include "src/fastertransformer/models/llama/LLaMA.h" #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/decoding_kernels.h" -#include "src/fastertransformer/kernels/gpt_kernels.h" +#include "src/fastertransformer/kernels/llama_kernels.h" #include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" -#include "src/fastertransformer/utils/memory_utils.h" #include "src/fastertransformer/utils/llama_utils.h" +#include "src/fastertransformer/utils/memory_utils.h" #include #include @@ -57,7 +57,7 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batch_size * max_seq_len * hidden_units_; input_attention_mask_ = - (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); + (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_seq_len, false)); normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); 
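With the cache now sized for max_seq_len on every sample, the K/V cache becomes the dominant allocation: self_cache_size above is per-rank layers × batch × max_seq_len × hidden_units, counted once for keys and once for values. A quick sizing sketch, using purely hypothetical model numbers rather than anything taken from this patch:

```
#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t num_layer = 32, pp_size = 2;   // layers, pipeline-parallel degree (assumed)
    const std::size_t batch = 8, max_seq_len = 2048; // assumed request shape
    const std::size_t head_num = 32, size_per_head = 128;
    const std::size_t hidden_units = head_num * size_per_head;
    const std::size_t elem = 2;                      // bytes per element for FP16

    // Mirrors the self_cache_size expression above; x2 because key_cache_ and
    // value_cache_ each take one copy.
    const std::size_t self_cache_elems = (num_layer / pp_size) * batch * max_seq_len * hidden_units;
    std::printf("KV cache per rank: %.2f GiB\n",
                2.0 * self_cache_elems * elem / (1024.0 * 1024.0 * 1024.0));
    return 0;
}
```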
logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); @@ -85,9 +85,6 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&logits_buf_)); allocator_->free((void**)(&key_cache_)); - if (cache_indirections_[0] != nullptr) { - allocator_->free((void**)(&cache_indirections_)[0]); - } allocator_->free((void**)(&tiled_input_ids_buf_)); allocator_->free((void**)(&tiled_input_lengths_buf_)); @@ -217,11 +214,7 @@ void LLaMA::forward(std::unordered_map* output_ten FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); const size_t batch_size = input_tensors->at("input_ids").shape[0]; - - // NOTE: Prefix Prompt PreProcessing - // get prefix_prompt_weight for each batch --> shape [batch, 1] - // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] - int seq_len = input_tensors->at("input_ids").shape[1]; + int seq_len = input_tensors->at("input_ids").shape[1]; // max cache seq len should include max prefix prompt length as it has k/v states const int start_pos = input_tensors->at("start_pos").max(); @@ -231,12 +224,8 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); const DataType data_type = getTensorType(); - const std::vector self_k_cache_shape = {num_layer_ / pipeline_para_.world_size_, - batch_size, - head_num_, - size_per_head_ / (16 / sizeof(T)), - max_seq_len_, - 16 / sizeof(T)}; + const std::vector self_k_cache_shape = { + num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}; const std::vector self_v_cache_shape = { num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}; @@ -250,8 +239,8 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); - invokeBuildDecoderAttentionMask( - input_attention_mask_, tiled_input_lengths_buf_, nullptr, batch_size, seq_len, 0, stream_); + invokeLLaMABuildDecoderAttentionMask( + input_attention_mask_, tiled_input_lengths_buf_, batch_size, seq_len, start_pos, stream_); sync_check_cuda_error(); if (pipeline_para_.rank_ == 0) { @@ -269,7 +258,10 @@ void LLaMA::forward(std::unordered_map* output_ten {"decoder_input", Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_input_buf_}}, {"attention_mask", - Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(seq_len)}, input_attention_mask_}}, + Tensor{MEMORY_GPU, + data_type, + {batch_size, 1, (size_t)seq_len, (size_t)(start_pos + seq_len)}, + input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, tiled_input_lengths_buf_}}, {"start_pos", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &start_pos}}}; @@ -306,7 +298,6 @@ void LLaMA::forward(std::unordered_map* output_ten vocab_size_); sync_check_cuda_error(); - if (std::is_same::value) { float* output_logits = output_tensors->at("output_logits").getPtr(); invokeCudaCast(output_logits, logits_buf_, batch_size * seq_len * vocab_size_, stream_); diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index dab7a0509..52f969a74 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -59,21 +59,20 @@ class LLaMA: public BaseLayer { void initialize(); protected: - T* input_attention_mask_; - T* decoder_output_buf_; - T* normed_decoder_output_buf_; + T* input_attention_mask_ = nullptr; + T* decoder_output_buf_ = nullptr; + T* normed_decoder_output_buf_ = nullptr; - T* 
logits_buf_; + T* logits_buf_ = nullptr; - T* key_cache_; - T* value_cache_; - int* cache_indirections_[2] = {nullptr, nullptr}; + T* key_cache_ = nullptr; + T* value_cache_ = nullptr; - int* tiled_input_ids_buf_; - int* tiled_input_lengths_buf_; + int* tiled_input_ids_buf_ = nullptr; + int* tiled_input_lengths_buf_ = nullptr; - T* context_decoder_input_buf_; - T* context_decoder_output_buf_; + T* context_decoder_input_buf_ = nullptr; + T* context_decoder_output_buf_ = nullptr; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 41abb0006..0f99e0887 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -215,6 +215,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* const int start_pos = input_tensors->at("start_pos").max(); const DataType data_type = getTensorType(); allocateBuffer(batch_size, seq_len); + sync_check_cuda_error(); T* decoder_input = input_tensors->at("decoder_input").getPtr(); T* decoder_output = output_tensors->at("decoder_output").getPtr(); diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 9a7cb9168..7595a2a88 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -103,38 +103,24 @@ class FTLLaMA: public IFLLaMA { llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * num_layers_ + 3]); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); - } - - ~FTLLaMA() override - { - ft::ftNcclParamDestroy(tensor_para_); - ft::ftNcclParamDestroy(pipeline_para_); - cublasLtDestroy(cublasltHandle_); - delete cublas_algo_map_; - delete cublas_wrapper_mutex_; - } - virtual void - forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int start_pos) override - { auto stream = at::cuda::getCurrentCUDAStream().stream(); cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); cublasSetStream(cublasHandle, stream); - ft::Allocator allocator = - ft::Allocator(at::cuda::getCurrentCUDAStream().device_index()); - ft::cublasMMWrapper cublas_wrapper = ft::cublasMMWrapper( - cublasHandle, cublasltHandle_, stream, cublas_algo_map_, cublas_wrapper_mutex_, &allocator); + + /// ft::Allocator allocator = + // ft::Allocator(at::cuda::getCurrentCUDAStream().device_index()); + allocator_ = new ft::Allocator(); + cublas_wrapper_ = new ft::cublasMMWrapper( + cublasHandle, cublasltHandle_, stream, cublas_algo_map_, cublas_wrapper_mutex_, allocator_); if (std::is_same::value) { - cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); } else if (std::is_same::value) { - cublas_wrapper.setFP32GemmConfig(); + cublas_wrapper_->setFP32GemmConfig(); } - const size_t request_batch_size = (size_t)input_ids.size(0); - const size_t seq_len = (size_t)input_ids.size(1); - ft::AttentionType attention_type = ft::getAttentionType(size_per_head_, ft::getSMVersion(), true, // remove_padding @@ -142,44 +128,63 @@ class FTLLaMA: public IFLLaMA { true, // is_fuse false, // with_relative_position_bias true); // causal_mask - - ft::LLaMA llama = ft::LLaMA(num_heads_, - size_per_head_, - inter_size_, - num_layers_, - vocab_size_, - rotary_embedding_dim_, - random_seed_, - 
max_seq_len_, - tensor_para_, - pipeline_para_, - stream, - &cublas_wrapper, - &allocator, - false, // is_free_buffer_after_forward - &prop_, // cuda_device_prop - attention_type // attention_type + // + llama_ = new ft::LLaMA(num_heads_, + size_per_head_, + inter_size_, + num_layers_, + vocab_size_, + rotary_embedding_dim_, + random_seed_, + max_seq_len_, + tensor_para_, + pipeline_para_, + stream, + cublas_wrapper_, + allocator_, + false, // is_free_buffer_after_forward + &prop_, // cuda_device_prop + attention_type // attention_type ); + } + + ~FTLLaMA() override + { + delete llama_; + delete cublas_wrapper_; + delete allocator_; + + ft::ftNcclParamDestroy(tensor_para_); + ft::ftNcclParamDestroy(pipeline_para_); + cublasLtDestroy(cublasltHandle_); + delete cublas_algo_map_; + delete cublas_wrapper_mutex_; + } + + virtual void + forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int start_pos) override + { + + const size_t batch_size = (size_t)input_ids.size(0); + const size_t seq_len = (size_t)input_ids.size(1); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_INT32, - std::vector{request_batch_size, seq_len}, - get_ptr(input_ids)}}, - {"input_lengths", ft::Tensor{ - ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{request_batch_size}, get_ptr(input_lengths)}}, + ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size, seq_len}, get_ptr(input_ids)}}, + {"input_lengths", + ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(input_lengths)}}, {"start_pos", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &start_pos}}}; std::unordered_map output_tensors = std::unordered_map{ {"output_logits", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, - std::vector{request_batch_size, seq_len, vocab_size_}, + std::vector{batch_size, seq_len, vocab_size_}, get_ptr(output_logits)}}}; + try { - llama.forward(&output_tensors, &input_tensors, &llama_weights_); + llama_->forward(&output_tensors, &input_tensors, &llama_weights_); } catch (std::runtime_error& error) { std::cout << error.what(); @@ -212,6 +217,10 @@ class FTLLaMA: public IFLLaMA { ft::NcclParam tensor_para_; ft::NcclParam pipeline_para_; + + ft::cublasMMWrapper* cublas_wrapper_; + ft::IAllocator* allocator_; + ft::LLaMA* llama_ = nullptr; }; class LLaMA: public th::jit::CustomClassHolder { diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h index d23f5ba8e..962a47764 100644 --- a/src/fastertransformer/utils/llama_utils.h +++ b/src/fastertransformer/utils/llama_utils.h @@ -130,6 +130,16 @@ static void print_tensor1(T* in, int dim1) free(out); } +template +static void print_tensor2(T* in, int dim1, int dim2) +{ + T* out = (T*)malloc(sizeof(T) * dim1 * dim2); + cudaMemcpy(out, in, sizeof(T) * dim1 * dim2, cudaMemcpyDeviceToHost); + _print_tensor2(out, dim1, dim2, dim2, 1); + std::cout << "\n"; + free(out); +} + template static void print_tensor3(T* in, int dim1, int dim2, int dim3, int stride1, int stride2, int size, int start) { From 857d956ccc142117aada942fde111d1a6024a401 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 25 Sep 2023 17:34:51 +0000 Subject: [PATCH 34/55] remove logging --- src/fastertransformer/models/llama/LLaMA.cc | 1 - src/fastertransformer/th_op/llama/LLaMA.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 29caa3722..f626e9a95 100644 --- 
a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -199,7 +199,6 @@ void LLaMA::forward(std::unordered_map* output_ten const std::unordered_map* input_tensors, const LLaMAWeight* llama_weights) { - // Logger::getLogger().setLevel(Logger::Level::DEBUG); // // input_tensors: // input_ids [batch_size, seq_len] diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 7595a2a88..bf41aa630 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -59,6 +59,8 @@ class FTLLaMA: public IFLLaMA { pipeline_para_size_(pipeline_para_size), weights_(weights) { + ft::Logger::getLogger().setLevel(ft::Logger::WARNING); + ft::check_cuda_error(cublasLtCreate(&cublasltHandle_)); cublas_algo_map_ = new ft::cublasAlgoMap(GEMM_CONFIG, ""); cublas_wrapper_mutex_ = new std::mutex(); From 3074afaaaabc559c4038854ab85f23ca7274bc12 Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 26 Sep 2023 03:42:00 +0000 Subject: [PATCH 35/55] remove README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 50f50cab2..72735e507 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Check out FasterTransformer [README.md](FasterTransformerReadME.md) mkdir -p FasterTransformer/build cd FasterTransformer/build git submodule init && git submodule update -cmake -DSM=xx -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON . +cmake -DSM=70 -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON .. make -j32 ``` From 80920075a5cecd68255dfd76522da34945f9d365 Mon Sep 17 00:00:00 2001 From: dypshong Date: Tue, 26 Sep 2023 17:47:12 +0000 Subject: [PATCH 36/55] overlap --- .../models/llama/LLaMAContextDecoder.cc | 40 ++++++++++++++----- .../models/llama/LLaMAContextDecoder.h | 33 ++++++++------- src/fastertransformer/th_op/llama/LLaMA.h | 20 ++++++++-- 3 files changed, 66 insertions(+), 27 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 0f99e0887..c4ed10752 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -27,6 +27,9 @@ namespace fastertransformer { template void LLaMAContextDecoder::initialize() { + check_cuda_error(cudaStreamCreateWithFlags(&comm_stream_, cudaStreamNonBlocking)); + check_cuda_error(cudaEventCreate(&kern_event_)); + check_cuda_error(cudaEventCreate(&comm_event_)); self_attention_layer_ = new LLaMAContextAttentionLayer(head_num_, size_per_head_, head_num_, @@ -59,7 +62,7 @@ void LLaMAContextDecoder::allocateBuffer() } template -void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -67,6 +70,10 @@ void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); decoder_layer_output_ = reinterpret_cast( allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + if (layer_output_buffer_ == nullptr) { + layer_output_buffer_ = reinterpret_cast( + 
allocator_->reMalloc(layer_output_buffer_, sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); + } h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); @@ -166,6 +173,10 @@ LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decode template LLaMAContextDecoder::~LLaMAContextDecoder() { + check_cuda_error(cudaEventDestroy(kern_event_)); + check_cuda_error(cudaEventDestroy(comm_event_)); + check_cuda_error(cudaStreamDestroy(comm_stream_)); + delete self_attention_layer_; delete ffn_layer_; freeBuffer(); @@ -200,8 +211,8 @@ void LLaMAContextDecoder::forward(std::unordered_map* // output tensors: // decoder_output [batch_size, seq_len, hidden_dimension], - // key_cache [num_layer, batch, max_seq_len, local_head_num, size_per_head] - // value_cache [num_layer, batch, max_seq_len, local_head_num, size_per_head] + // key_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] + // value_cache [num_layer, batch, local_head_num, mxa_seq_len, size_per_head] // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * batch_size'. // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during @@ -210,11 +221,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* FT_CHECK(input_tensors->size() == 4); FT_CHECK(output_tensors->size() == 3); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; - const int start_pos = input_tensors->at("start_pos").max(); - const DataType data_type = getTensorType(); - allocateBuffer(batch_size, seq_len); + const int batch_size = input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; + const int start_pos = input_tensors->at("start_pos").max(); + const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; + const DataType data_type = getTensorType(); + allocateBuffer(batch_size, seq_len, max_seq_len); sync_check_cuda_error(); T* decoder_input = input_tensors->at("decoder_input").getPtr(); @@ -257,6 +269,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { + check_cuda_error(cudaEventSynchronize(kern_event_)); invokeRemovePadding( decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); sync_check_cuda_error(); @@ -354,8 +367,17 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 && pipeline_para_.world_size_ > 1) { int data_size = h_token_num * hidden_units_; - ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, stream_); + check_cuda_error(cudaEventSynchronize(comm_event_)); + check_cuda_error(cudaMemcpyAsync( + layer_output_buffer_, layer_output, sizeof(T) * data_size, cudaMemcpyDeviceToDevice, stream_)); + check_cuda_error(cudaEventRecord(kern_event_, stream_)); + check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_)); + ftNcclSend(layer_output_buffer_, data_size, pipeline_para_.rank_ + 1, pipeline_para_, comm_stream_); sync_check_cuda_error(); + check_cuda_error(cudaEventRecord(comm_event_, comm_stream_)); + + //ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, comm_stream_); + //sync_check_cuda_error(); } if ((l == 
num_layer_ - 1) && is_unpadded_mha) { diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index cb6736f02..7a4866ddc 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -43,6 +43,10 @@ class LLaMAContextDecoder: public BaseLayer { size_t rotary_embedding_dim_; float layernorm_eps_; + cudaEvent_t kern_event_; + cudaEvent_t comm_event_; + cudaStream_t comm_stream_; + // calculated data size_t hidden_units_; @@ -56,7 +60,7 @@ class LLaMAContextDecoder: public BaseLayer { FfnLayer* ffn_layer_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len); void freeBuffer() override; bool isValidLayerParallelId(uint l); @@ -67,6 +71,7 @@ class LLaMAContextDecoder: public BaseLayer { void initialize(); protected: + T* layer_output_buffer_ = nullptr; T* decoder_normed_input_ = nullptr; T* self_attn_output_ = nullptr; T* decoder_layer_output_ = nullptr; @@ -75,19 +80,19 @@ class LLaMAContextDecoder: public BaseLayer { int* cu_seqlens_ = nullptr; public: - LLaMAContextDecoder(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t rotary_embedding_dim, - float layernorm_eps, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float, - AttentionType attention_type = AttentionType::FUSED_MHA); + LLaMAContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + float layernorm_eps, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type = AttentionType::FUSED_MHA); LLaMAContextDecoder(LLaMAContextDecoder const& decoder); diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index bf41aa630..597279b92 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -105,16 +105,17 @@ class FTLLaMA: public IFLLaMA { llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * num_layers_ + 3]); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); + ft::check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + ft::check_cuda_error(cudaEventCreate(&event_)); - auto stream = at::cuda::getCurrentCUDAStream().stream(); cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); - cublasSetStream(cublasHandle, stream); + cublasSetStream(cublasHandle, stream_); /// ft::Allocator allocator = // ft::Allocator(at::cuda::getCurrentCUDAStream().device_index()); allocator_ = new ft::Allocator(); cublas_wrapper_ = new ft::cublasMMWrapper( - cublasHandle, cublasltHandle_, stream, cublas_algo_map_, cublas_wrapper_mutex_, allocator_); + cublasHandle, cublasltHandle_, stream_, cublas_algo_map_, cublas_wrapper_mutex_, allocator_); if (std::is_same::value) { cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); @@ -141,7 +142,7 @@ class FTLLaMA: public IFLLaMA { max_seq_len_, tensor_para_, pipeline_para_, - stream, + stream_, cublas_wrapper_, allocator_, false, // is_free_buffer_after_forward @@ -152,6 +153,9 @@ class FTLLaMA: public IFLLaMA { ~FTLLaMA() 
override { + ft::check_cuda_error(cudaEventDestroy(event_)); + ft::check_cuda_error(cudaStreamDestroy(stream_)); + delete llama_; delete cublas_wrapper_; delete allocator_; @@ -186,7 +190,12 @@ class FTLLaMA: public IFLLaMA { get_ptr(output_logits)}}}; try { + ft::check_cuda_error(cudaEventSynchronize(event_)); llama_->forward(&output_tensors, &input_tensors, &llama_weights_); + ft::check_cuda_error(cudaEventRecord(event_, stream_)); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + ft::check_cuda_error(cudaStreamWaitEvent(stream, event_)); } catch (std::runtime_error& error) { std::cout << error.what(); @@ -210,6 +219,9 @@ class FTLLaMA: public IFLLaMA { int64_t tensor_para_size_; int64_t pipeline_para_size_; + cudaStream_t stream_; + cudaEvent_t event_; + std::vector weights_; cublasLtHandle_t cublasltHandle_; std::mutex* cublas_wrapper_mutex_; From 1187340ad4b78cd4b9d5d4300f96d7812acbbcd6 Mon Sep 17 00:00:00 2001 From: dypshong Date: Wed, 27 Sep 2023 15:36:49 +0000 Subject: [PATCH 37/55] overlapping versino --- src/fastertransformer/models/llama/LLaMAContextDecoder.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index c4ed10752..382999c37 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -269,7 +269,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { - check_cuda_error(cudaEventSynchronize(kern_event_)); invokeRemovePadding( decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); sync_check_cuda_error(); From 949c4e7737f412b9419fae8d87b4c4b79f3fca04 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 29 Sep 2023 00:34:43 +0000 Subject: [PATCH 38/55] start_pos for each sample --- .../kernels/llama_kernels.cu | 107 ++++++++++++--- src/fastertransformer/kernels/llama_kernels.h | 18 ++- .../kernels/unfused_attention_kernels.cu | 87 ++++++------- .../kernels/unfused_attention_kernels.h | 6 +- .../LLaMAContextAttentionLayer.cc | 32 +++-- src/fastertransformer/models/llama/LLaMA.cc | 97 +++++++------- src/fastertransformer/models/llama/LLaMA.h | 18 +-- .../models/llama/LLaMAContextDecoder.cc | 123 +++++++----------- .../models/llama/LLaMAContextDecoder.h | 17 +-- src/fastertransformer/th_op/llama/LLaMA.cc | 19 +-- src/fastertransformer/th_op/llama/LLaMA.h | 30 ++++- src/fastertransformer/utils/llama_utils.h | 14 +- 12 files changed, 320 insertions(+), 248 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index 3c753f866..5379eda1d 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -1,25 +1,64 @@ -#include "src/fastertransformer/utils/cuda_fp8_utils.h" #include "src/fastertransformer/kernels/llama_kernels.h" +#include "src/fastertransformer/utils/cuda_fp8_utils.h" + +#include +#include +#include namespace fastertransformer { +__global__ void LLaMAgetPaddingOffsetAndCuSeqLensKernel( + int* padding_offset, int* cu_seqlens, const int* sequence_length, const int batch_size, const int seq_len) +{ + // do cumulated sum + int total_seq_len = 0; + int cum_offset = 0; + int index = 0; + for (int i = 0; i < batch_size; i++) { + const int num_tokens = sequence_length[i]; + cu_seqlens[i] = total_seq_len; + for (int j = 0; j < num_tokens; j++) { + padding_offset[index] = 
cum_offset; + index++; + } + cum_offset += seq_len - num_tokens; + total_seq_len += num_tokens; + } + cu_seqlens[batch_size] = total_seq_len; +} + +void invokeLLaMAGetPaddingOffsetAndCuSeqLens(int* padding_offset, + int* cu_seqlens, + const int* input_lengths, + const int batch_size, + const int seq_len, + cudaStream_t stream) +{ + LLaMAgetPaddingOffsetAndCuSeqLensKernel<<<1, 1, 0, stream>>>( + padding_offset, cu_seqlens, input_lengths, batch_size, seq_len); +} + template -__global__ void LLaMAbuildDecoderAttentionMaskKernel( - T* attention_mask, const int* sequence_lengths, const int batch_size, const int seq_len, const int start_pos) +__global__ void LLaMAbuildDecoderAttentionMaskKernel(T* attention_mask, + const int* sequence_lengths, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int max_length) { - // sequence_lengths: - // [batch_size] // attention_mask: - // [batch_size, 1, seq_len, seq_len + start_pos] - const int max_length = seq_len + start_pos; + // [batch_size, 1, seq_len, max_length] + const int batch_idx = blockIdx.x; const int mask_size_per_seq = seq_len * max_length; - attention_mask += blockIdx.x * mask_size_per_seq; - const int seq_length = sequence_lengths[blockIdx.x]; + attention_mask += batch_idx * mask_size_per_seq; + const int context_length = context_lengths[batch_idx]; + const int length = sequence_lengths[batch_idx]; + const int offset = max_length - length; for (int i = threadIdx.x; i < mask_size_per_seq; i += blockDim.x) { int row_id = i / max_length; int col_id = i % max_length; - if (row_id < seq_length && col_id <= (row_id + start_pos)) { + if (row_id < length && col_id <= (row_id + context_length)) { attention_mask[i] = (T)(1.0f); } else { @@ -30,27 +69,59 @@ __global__ void LLaMAbuildDecoderAttentionMaskKernel( template void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, - const int* sequence_lengths, + const int* sequence_length, + const int* context_lengths, const int batch_size, const int seq_len, - const int start_pos, + const int max_length, cudaStream_t stream) { - LLaMAbuildDecoderAttentionMaskKernel - <<>>(attention_mask, sequence_lengths, batch_size, seq_len, start_pos); + LLaMAbuildDecoderAttentionMaskKernel<<>>( + attention_mask, sequence_length, context_lengths, batch_size, seq_len, max_length); } template void invokeLLaMABuildDecoderAttentionMask(float* attention_mask, - const int* sequence_lengths, + const int* sequence_length, + const int* context_lengths, const int batch_size, const int seq_len, - const int start_pos, + const int max_length, cudaStream_t stream); template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, - const int* sequence_lengths, + const int* sequence_length, + const int* context_lengths, const int batch_size, const int seq_len, - const int start_pos, + const int max_length, cudaStream_t stream); + +template +__global__ void LLaMACopyKernel(T* dst, T* src, const int count) +{ + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + if (idx * X_ELEMS >= count) { + return; + } + + auto v_dst = reinterpret_cast(dst); + auto v_src = reinterpret_cast(src); + v_dst[idx] = v_src[idx]; +} + +template +void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 
4 : 8; + assert(count % x == 0); + int grid_sz = (count / x + block_sz - 1) / block_sz; + LLaMACopyKernel<<>>(dst, src, count); +} + +template void invokeLLaMACopyKernel(float* dst, float* src, const int count, cudaStream_t stream); +template void invokeLLaMACopyKernel(half* dst, half* src, const int count, cudaStream_t stream); + } // namespace fastertransformer diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h index 320b5624f..a218b40d1 100644 --- a/src/fastertransformer/kernels/llama_kernels.h +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -1,15 +1,25 @@ #pragma once - #include "src/fastertransformer/utils/cuda_fp8_utils.h" #include "src/fastertransformer/utils/memory_utils.h" namespace fastertransformer { +void invokeLLaMAGetPaddingOffsetAndCuSeqLens(int* padding_offset, + int* cu_seqlens, + const int* input_lengths, + const int batch_size, + const int seq_len, + cudaStream_t stream); + template void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, - const int* sequence_lengths, + const int* sequence_length, + const int* context_lengths, const int batch_size, const int seq_len, - const int start_pos, + const int max_length, cudaStream_t stream); -} // namespace fastertransformer + +template +void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream); +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 1010ca3f3..134d63921 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1592,7 +1592,7 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int start_pos) + const int* start_pos) { constexpr int vec_size = Vec_t::size; using Vec_t = typename Vec_t::Type; @@ -1622,7 +1622,7 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, v = *reinterpret_cast(&QKV[src_v_idx]); } - mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, start_pos + seq_idx); + mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, start_pos[batch_idx] + seq_idx); const int dest_q_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + seq_idx * size_per_head + tidx * vec_size; @@ -1649,7 +1649,7 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int start_pos, + const int* start_pos, cudaStream_t stream) { dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); @@ -1678,7 +1678,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int start_pos, + const int* start_pos, cudaStream_t stream); template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, @@ -1692,7 +1692,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int start_pos, + const int* start_pos, cudaStream_t stream); #ifdef ENABLE_BF16 template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, @@ -1706,7 +1706,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int start_pos, + const 
int* start_pos, cudaStream_t stream); #endif @@ -1898,15 +1898,15 @@ INSTANTIATETRANSPOSE4DBATCHMAJOR(__nv_bfloat16); #undef INSTANTIATETRANSPOSE4DBATCHMAJOR template -__global__ void transpose_4d_save_to_cache(T* k_dst, - const T* k_src, - T* v_dst, - const T* v_src, - const int head_num, - const int size_per_head, - const int seq_len, - const int max_seq_len, - const int start_pos) +__global__ void transpose_4d_save_to_cache(T* k_dst, + const T* k_src, + T* v_dst, + const T* v_src, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len, + const int* start_pos) { // [batch_size, head_num, seq_len, size_per_head] const int batch_id = blockIdx.y; @@ -1915,16 +1915,14 @@ __global__ void transpose_4d_save_to_cache(T* k_dst, // 16 byte loads will handle "x" dimension auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * seq_len + head_id * size_per_head * seq_len); - auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * max_seq_len - + head_id * size_per_head * max_seq_len - + start_pos * size_per_head - ); + auto key_dst = + reinterpret_cast(k_dst + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len + start_pos[batch_id] * size_per_head); auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * seq_len + head_id * size_per_head * seq_len); - auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_seq_len - + head_id * size_per_head * max_seq_len - + start_pos * size_per_head - ); + auto val_dst = + reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len + start_pos[batch_id] * size_per_head); // idx is over output dimension L * size_per_head / x for values const int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -1950,7 +1948,7 @@ void invokeLLaMASaveToCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int start_pos, + const int* start_pos, cudaStream_t stream) { constexpr int block_sz = 128; @@ -1971,7 +1969,7 @@ void invokeLLaMASaveToCache(T* k_dst, const int max_seq_len, \ const int size_per_head, \ const int local_head_num, \ - const int start_pos, \ + const int* start_pos, \ cudaStream_t stream) INSTANTIATESAVETOCACHE(float); INSTANTIATESAVETOCACHE(half); @@ -1981,30 +1979,29 @@ INSTANTIATESAVETOCACHE(__nv_bfloat16); #undef INSTANTIATESAVETOCACHE template -__global__ void transpose_4d_load_from_cache(T* k_dst, - const T* k_src, - T* v_dst, - const T* v_src, - const int head_num, - const int size_per_head, - const int seq_len, - const int max_seq_len, - const int start_pos) +__global__ void transpose_4d_load_from_cache(T* k_dst, + const T* k_src, + T* v_dst, + const T* v_src, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len, + const int max_length) { - // [batch_size, head_num, start_pos+seq_len, size_per_head] + // [batch_size, head_num, max_length, size_per_head] const int batch_id = blockIdx.y; const int head_id = blockIdx.z; - const int real_seq_len = start_pos + seq_len; // 16 byte loads will handle "x" dimension auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * max_seq_len + head_id * size_per_head * max_seq_len); - auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * real_seq_len - + head_id * size_per_head * real_seq_len); + auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * 
max_length + + head_id * size_per_head * max_length); auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * max_seq_len + head_id * size_per_head * max_seq_len); - auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * real_seq_len - + head_id * size_per_head * real_seq_len); + auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_length + + head_id * size_per_head * max_length); // idx is over output dimension L * size_per_head / x for values const int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -2012,7 +2009,7 @@ __global__ void transpose_4d_load_from_cache(T* k_dst, constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; const int size_per_head_div_x = size_per_head / X_ELEMS; - if (idx >= size_per_head_div_x * real_seq_len) { + if (idx >= size_per_head_div_x * max_length) { return; } @@ -2030,15 +2027,15 @@ void invokeLLaMALoadFromCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int start_pos, + const int max_length, cudaStream_t stream) { constexpr int block_sz = 128; constexpr int x = (sizeof(T) == 4) ? 4 : 8; - dim3 grid(((start_pos + seq_len) * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); + dim3 grid((max_length * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); transpose_4d_load_from_cache<<>>( - k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, start_pos); + k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, max_length); } #define INSTANTIATELOADFROMCACHE(T) \ @@ -2051,7 +2048,7 @@ void invokeLLaMALoadFromCache(T* k_dst, const int max_seq_len, \ const int size_per_head, \ const int local_head_num, \ - const int start_pos, \ + const int max_length, \ cudaStream_t stream) INSTANTIATELOADFROMCACHE(float); INSTANTIATELOADFROMCACHE(half); diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index 2d4b01dde..c1d85816f 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -125,7 +125,7 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int start_pos, + const int* start_pos, cudaStream_t stream); template @@ -214,7 +214,7 @@ void invokeLLaMASaveToCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int start_pos, + const int* start_pos, cudaStream_t stream); template void invokeLLaMALoadFromCache(T* k_dst, @@ -226,7 +226,7 @@ void invokeLLaMALoadFromCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int start_pos, + const int max_length, cudaStream_t stream); template diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index a9989543a..bdf745562 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -30,12 +30,13 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten { // input_tensors: // input_query [token_num, hidden_dimension] - // attention_mask [batch_size, 1, seq_len, seq_len] + // attention_mask [batch_size, 1, seq_len, max_length] // attention_type [1] // 
layer_id [1], int on cpu + // start_pos, int, [batch_size] + // max_length, int, [batch_size] on cpu // padding_offset, int, [token_num] (optional) // cu_seqlens, int, [batch_size] (optional) - // each element contains ptr with buffer shape[2, head_num_, prompt_length, size_per_head] // output_tensors: // hidden_features [token_num, hidden_dimension] @@ -49,8 +50,9 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten const int max_seq_len = (int)(output_tensors->at("key_cache").shape[2]); const int layer_id = input_tensors->getVal("layer_id"); const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); - int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); - int start_pos = input_tensors->at("start_pos").max(); + const int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + const int* start_pos = input_tensors->at("start_pos").getPtr(); + const int max_length = input_tensors->at("max_length").getVal(); T* attention_out = output_tensors->at("hidden_features").getPtr(); T* attention_input = input_tensors->at("input_query").getPtr(); @@ -84,7 +86,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync(q_buf_2_, 0, batch_size * seq_len * 3 * hidden_units_ * sizeof(T), stream_); + cudaMemsetAsync(q_buf_2_, 0, batch_size * max_seq_len * 3 * hidden_units_ * sizeof(T), stream_); sync_check_cuda_error(); } invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, @@ -102,6 +104,10 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten stream_); sync_check_cuda_error(); + // std::cout << layer_id << "===============\n"; + // print_tensor4(k_buf_2_, batch_size, head_num_, seq_len, size_per_head_); + // std::cout << layer_id << "===============\n"; + // key_cache [batch, local_head_num, max_seq_len, size_per_head] // value_cache [batch, local_head_num, max_seq_len, size_per_head] T* key_cache = output_tensors->getPtr("key_cache"); @@ -129,17 +135,21 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten max_seq_len, size_per_head_, head_num_, - start_pos, + max_length, stream_); + // std::cout << layer_id << "===============\n"; + // print_tensor4(k_buf_2_, batch_size, head_num_, max_length, size_per_head_); + // std::cout << layer_id << "===============\n"; + if (attention_type == AttentionType::FUSED_MHA) { dispatcher_fp16->setup_causal_masked_fmha(seq_len, batch_size); dispatcher_fp16->run_causal_masked_fmha(qkv_buf_, cu_seqlens, qkv_buf_3_, true, stream_); } else { const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = seq_len; // q length - const int attention_seq_len_2 = start_pos + seq_len; // kv length + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = max_length; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); // @@ -221,6 +231,9 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten sync_check_cuda_error(); POP_RANGE; } + //std::cout << layer_id << "===============\n"; + //print_tensor4(qk_buf_, batch_size, head_num_, attention_seq_len_1, attention_seq_len_2); + //std::cout << layer_id << "===============\n"; PUSH_RANGE("QK*V batch gemm"); cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, @@ -243,6 +256,9 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten batch_size * head_num_); sync_check_cuda_error(); + // std::cout << layer_id << "===============\n"; + // print_tensor4(qkv_buf_2_, 
batch_size, head_num_, attention_seq_len_1, size_per_head_); + // std::cout << layer_id << "===============\n"; // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) if (padding_offset == nullptr) { diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index f626e9a95..52f8c76f0 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -58,22 +58,19 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ input_attention_mask_ = (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_seq_len, false)); - normed_decoder_output_buf_ = - (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); - logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); value_cache_ = key_cache_ + self_cache_size; - tiled_input_ids_buf_ = - (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batch_size * seq_len, false)); - tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batch_size, false)); - context_decoder_input_buf_ = (T*)(allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); context_decoder_output_buf_ = (T*)(allocator_->reMalloc( context_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + normed_decoder_output_buf_ = + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); + is_allocate_buffer_ = true; } @@ -82,16 +79,10 @@ void LLaMA::freeBuffer() { if (is_allocate_buffer_) { allocator_->free((void**)(&input_attention_mask_)); - allocator_->free((void**)(&logits_buf_)); - allocator_->free((void**)(&key_cache_)); - - allocator_->free((void**)(&tiled_input_ids_buf_)); - allocator_->free((void**)(&tiled_input_lengths_buf_)); - allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); - + allocator_->free((void**)(&logits_buf_)); is_allocate_buffer_ = false; } } @@ -203,78 +194,86 @@ void LLaMA::forward(std::unordered_map* output_ten // input_tensors: // input_ids [batch_size, seq_len] // input_lengths [batch_size] - // start_pos [1] int on cpu + // start_pos [batch_size] + // num_tokens [1] int on cpu + // max_length [1] int on cpu // output_tensors: // output_logits [batch_size, seq_len, vocab_size] - FT_CHECK_WITH_INFO(input_tensors->size() >= 3, "input_tensors->size() >= 3"); + FT_CHECK_WITH_INFO(input_tensors->size() == 5, "input_tensors->size() == 5"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); - const size_t batch_size = input_tensors->at("input_ids").shape[0]; - int seq_len = input_tensors->at("input_ids").shape[1]; - - // max cache seq len should include max prefix prompt length as it has k/v states - const int start_pos = input_tensors->at("start_pos").max(); - const cudaDataType_t gemm_data_type = getCudaDataType(); + const DataType data_type = getTensorType(); + const size_t batch_size = input_tensors->at("input_ids").shape[0]; + const int seq_len = 
input_tensors->at("input_ids").shape[1]; + const int* input_ids = input_tensors->at("input_ids").getPtr(); + const int* start_pos = input_tensors->at("start_pos").getPtr(); + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int num_tokens = input_tensors->at("num_tokens").getVal(0); + const int max_length = input_tensors->at("max_length").getVal(0); allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); - const DataType data_type = getTensorType(); - const std::vector self_k_cache_shape = { - num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}; - const std::vector self_v_cache_shape = { - num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}; - - invokeTileGptInputs(tiled_input_ids_buf_, - tiled_input_lengths_buf_, - input_tensors->at("input_ids").getPtr(), - input_tensors->at("input_lengths").getPtr(), - batch_size, - 1, - seq_len, - stream_); - sync_check_cuda_error(); - invokeLLaMABuildDecoderAttentionMask( - input_attention_mask_, tiled_input_lengths_buf_, batch_size, seq_len, start_pos, stream_); + input_attention_mask_, input_lengths, start_pos, batch_size, seq_len, max_length, stream_); sync_check_cuda_error(); if (pipeline_para_.rank_ == 0) { invokeInputIdsEmbeddingLookup(context_decoder_input_buf_, llama_weights->pre_decoder_embedding_table, - tiled_input_ids_buf_, + input_ids, seq_len, batch_size, hidden_units_, stream_); sync_check_cuda_error(); } + else { + int data_size = batch_size * seq_len * hidden_units_; + ftNcclRecv(context_decoder_input_buf_, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); + sync_check_cuda_error(); + } std::unordered_map decoder_input_tensors{ {"decoder_input", Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_input_buf_}}, {"attention_mask", - Tensor{MEMORY_GPU, - data_type, - {batch_size, 1, (size_t)seq_len, (size_t)(start_pos + seq_len)}, - input_attention_mask_}}, - {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, tiled_input_lengths_buf_}}, - {"start_pos", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &start_pos}}}; + Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(max_length)}, input_attention_mask_}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, + {"start_pos", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, start_pos}}, + {"num_tokens", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &num_tokens}}, + {"max_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_length}}}; std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_output_buf_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, - {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}}; + {"key_cache", + Tensor{MEMORY_GPU, + data_type, + {num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, + key_cache_}}, + {"value_cache", + Tensor{MEMORY_GPU, + data_type, + {num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, + value_cache_}}}; llama_context_decoder_->forward( &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { + if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { + 
ftNcclSend(context_decoder_output_buf_, + batch_size * seq_len * hidden_units_, + pipeline_para_.rank_ + 1, + pipeline_para_, + stream_); + sync_check_cuda_error(); + } + else { invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 52f969a74..62af1a6d2 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -59,19 +59,15 @@ class LLaMA: public BaseLayer { void initialize(); protected: - T* input_attention_mask_ = nullptr; - T* decoder_output_buf_ = nullptr; - T* normed_decoder_output_buf_ = nullptr; - - T* logits_buf_ = nullptr; + T* input_attention_mask_ = nullptr; + T* key_cache_ = nullptr; + T* value_cache_ = nullptr; - T* key_cache_ = nullptr; - T* value_cache_ = nullptr; - - int* tiled_input_ids_buf_ = nullptr; - int* tiled_input_lengths_buf_ = nullptr; + T* decoder_output_buf_ = nullptr; + T* normed_decoder_output_buf_ = nullptr; + T* logits_buf_ = nullptr; - T* context_decoder_input_buf_ = nullptr; + T* context_decoder_input_buf_ = nullptr; T* context_decoder_output_buf_ = nullptr; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 382999c37..1393767ce 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -17,6 +17,7 @@ #include "src/fastertransformer/models/llama/LLaMAContextDecoder.h" #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" +#include "src/fastertransformer/kernels/llama_kernels.h" #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" @@ -27,9 +28,6 @@ namespace fastertransformer { template void LLaMAContextDecoder::initialize() { - check_cuda_error(cudaStreamCreateWithFlags(&comm_stream_, cudaStreamNonBlocking)); - check_cuda_error(cudaEventCreate(&kern_event_)); - check_cuda_error(cudaEventCreate(&comm_event_)); self_attention_layer_ = new LLaMAContextAttentionLayer(head_num_, size_per_head_, head_num_, @@ -64,20 +62,16 @@ void LLaMAContextDecoder::allocateBuffer() template void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len) { + padding_offset_ = + reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); + cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); self_attn_output_ = reinterpret_cast( allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); decoder_layer_output_ = reinterpret_cast( allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); - if (layer_output_buffer_ == nullptr) { - layer_output_buffer_ = reinterpret_cast( - allocator_->reMalloc(layer_output_buffer_, sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); - } - h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); - padding_offset_ = - 
reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); - cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); is_allocate_buffer_ = true; } @@ -88,9 +82,8 @@ void LLaMAContextDecoder::freeBuffer() allocator_->free((void**)(&decoder_normed_input_)); allocator_->free((void**)(&self_attn_output_)); allocator_->free((void**)(&decoder_layer_output_)); - allocator_->free((void**)(&h_pinned_token_num_ptr_), true); - allocator_->free((void**)(&padding_offset_)); allocator_->free((void**)(&cu_seqlens_)); + allocator_->free((void**)(&padding_offset_)); is_allocate_buffer_ = false; } } @@ -173,10 +166,6 @@ LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decode template LLaMAContextDecoder::~LLaMAContextDecoder() { - check_cuda_error(cudaEventDestroy(kern_event_)); - check_cuda_error(cudaEventDestroy(comm_event_)); - check_cuda_error(cudaStreamDestroy(comm_stream_)); - delete self_attention_layer_; delete ffn_layer_; freeBuffer(); @@ -207,7 +196,9 @@ void LLaMAContextDecoder::forward(std::unordered_map* // decoder_input [batch_size, seq_len, hidden_dimension], // attention_mask [batch_size, 1, seq_len, seq_len] // input_lengths [batch_size] - // start_pos [1] + // start_pos [batch_size] + // num_tokens [1] int on cpu + // max_length [1] int on cpu // output tensors: // decoder_output [batch_size, seq_len, hidden_dimension], @@ -218,14 +209,17 @@ void LLaMAContextDecoder::forward(std::unordered_map* // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. - FT_CHECK(input_tensors->size() == 4); + FT_CHECK(input_tensors->size() == 6); FT_CHECK(output_tensors->size() == 3); - - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; - const int start_pos = input_tensors->at("start_pos").max(); - const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; - const DataType data_type = getTensorType(); + const DataType data_type = getTensorType(); + const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); + const int batch_size = input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* start_pos = input_tensors->at("start_pos").getPtr(); + const int max_length = input_tensors->at("max_length").getVal(0); + + const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; allocateBuffer(batch_size, seq_len, max_seq_len); sync_check_cuda_error(); @@ -246,20 +240,15 @@ void LLaMAContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } - AttentionType attention_type = attention_type_; - const bool is_unpadded_mha = isUnPaddedMHA(attention_type); - size_t h_token_num = batch_size * seq_len; if (is_unpadded_mha) { - const int* input_lengths = input_tensors->at("input_lengths").getPtr(); - invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, - &h_token_num, - padding_offset_, - cu_seqlens_, - input_lengths, - batch_size, - seq_len, - stream_); + invokeLLaMAGetPaddingOffsetAndCuSeqLens( + padding_offset_, cu_seqlens_, input_lengths, batch_size, seq_len, stream_); + sync_check_cuda_error(); + + h_token_num = input_tensors->at("num_tokens").getVal(); + + invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); 
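The unpadded path above computes one padding offset per kept token plus cumulative sequence lengths, and invokeRemovePadding then packs [batch, seq_len, hidden] down to [num_tokens, hidden]. A CPU sketch of that bookkeeping with example lengths, mirroring LLaMAgetPaddingOffsetAndCuSeqLensKernel from this patch:

```
#include <cstdio>
#include <vector>

int main()
{
    const int seq_len = 8;                      // padded length (assumed)
    const std::vector<int> lengths = {5, 3, 8}; // per-sample valid lengths (example values)

    std::vector<int> padding_offset, cu_seqlens = {0};
    int cum_offset = 0;
    for (int len : lengths) {
        for (int j = 0; j < len; ++j)
            padding_offset.push_back(cum_offset);   // padding slots accumulated so far
        cum_offset += seq_len - len;
        cu_seqlens.push_back(cu_seqlens.back() + len);
    }

    for (int off : padding_offset) std::printf("%d ", off);  // 0 x5, 3 x3, 8 x8
    std::printf("\n");
    for (int cs : cu_seqlens)      std::printf("%d ", cs);   // 0 5 8 16
    std::printf("\n");
    return 0;
}
```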
sync_check_cuda_error(); } @@ -268,30 +257,18 @@ void LLaMAContextDecoder::forward(std::unordered_map* continue; } - if (l == 0 && is_unpadded_mha) { - invokeRemovePadding( - decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); - sync_check_cuda_error(); - } - const bool is_final = false; T* layer_input = decoder_layer_output_; T* layer_output = decoder_layer_output_; if (!is_unpadded_mha) { - if (l == 0) { + if (isFirstLayerParallelId(l)) { layer_input = decoder_input; } - if (l == num_layer_ - 1) { + if (isLastLayerParallelId(l)) { layer_output = decoder_output; } } - if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_; - ftNcclRecv(layer_input, data_size, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - sync_check_cuda_error(); - } - invokeGeneralLLaMALayerNorm(decoder_normed_input_, layer_input, llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, @@ -306,11 +283,14 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"attention_mask", Tensor{MEMORY_GPU, data_type, - {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len)}, + {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(max_length)}, attention_mask}}, - {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, + {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, - {"start_pos", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &start_pos}}}; + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, input_lengths}}, + {"start_pos", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, start_pos}}, + {"max_length", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &max_length}}, + }; if (is_unpadded_mha) { self_attention_input_tensors.insert("padding_offset", @@ -333,6 +313,11 @@ void LLaMAContextDecoder::forward(std::unordered_map* &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); + //std::cout << l << "===============\n"; + //print_tensor2(self_attn_output_, h_token_num, hidden_units_); + //std::cout << l << "===============\n"; + + invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, decoder_normed_input_, @@ -362,32 +347,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* stream_); sync_check_cuda_error(); + } - if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 - && pipeline_para_.world_size_ > 1) { - int data_size = h_token_num * hidden_units_; - check_cuda_error(cudaEventSynchronize(comm_event_)); - check_cuda_error(cudaMemcpyAsync( - layer_output_buffer_, layer_output, sizeof(T) * data_size, cudaMemcpyDeviceToDevice, stream_)); - check_cuda_error(cudaEventRecord(kern_event_, stream_)); - check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_)); - ftNcclSend(layer_output_buffer_, data_size, pipeline_para_.rank_ + 1, pipeline_para_, comm_stream_); - sync_check_cuda_error(); - check_cuda_error(cudaEventRecord(comm_event_, comm_stream_)); - - //ftNcclSend(layer_output, data_size, pipeline_para_.rank_ + 1, pipeline_para_, comm_stream_); - //sync_check_cuda_error(); - } - - if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output, - decoder_layer_output_, - padding_offset_, - h_token_num, - head_num_ * size_per_head_, - stream_); - sync_check_cuda_error(); - } + if (is_unpadded_mha) { + invokeRebuildPadding( + decoder_output, 
decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); + sync_check_cuda_error(); } if (is_free_buffer_after_forward_ == true) { diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index 7a4866ddc..d76ff0687 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -43,10 +43,6 @@ class LLaMAContextDecoder: public BaseLayer { size_t rotary_embedding_dim_; float layernorm_eps_; - cudaEvent_t kern_event_; - cudaEvent_t comm_event_; - cudaStream_t comm_stream_; - // calculated data size_t hidden_units_; @@ -71,13 +67,12 @@ class LLaMAContextDecoder: public BaseLayer { void initialize(); protected: - T* layer_output_buffer_ = nullptr; - T* decoder_normed_input_ = nullptr; - T* self_attn_output_ = nullptr; - T* decoder_layer_output_ = nullptr; - size_t* h_pinned_token_num_ptr_ = nullptr; - int* padding_offset_ = nullptr; - int* cu_seqlens_ = nullptr; + int* padding_offset_ = nullptr; + int* cu_seqlens_ = nullptr; + T* decoder_normed_input_ = nullptr; + T* self_attn_output_ = nullptr; + T* decoder_layer_output_ = nullptr; + size_t* h_pinned_token_num_ptr_ = nullptr; public: LLaMAContextDecoder(size_t head_num, diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index 45c1e1575..e2b819c4b 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -30,8 +30,7 @@ LLaMA::LLaMA(const int64_t num_heads, const int64_t tensor_para_size, const int64_t pipeline_para_size, const vector weights): - vocab_size_(vocab_size), - st_(weights[0].scalar_type()) + vocab_size_(vocab_size), st_(weights[0].scalar_type()) { for (auto t : weights) { CHECK_INPUT(t, st_); @@ -74,8 +73,12 @@ LLaMA::~LLaMA() delete ftllama; } -th::Tensor -LLaMA::forward(th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t start_pos) +th::Tensor LLaMA::forward(th::Tensor& output_logits, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& start_pos, + const int64_t num_tokens, + const int64_t max_length) { CHECK_TH_CUDA(input_ids); CHECK_CONTIGUOUS(input_ids); @@ -84,11 +87,9 @@ LLaMA::forward(th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t s CHECK_CONTIGUOUS(input_lengths); TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); - const int batch_size = input_ids.size(0); - const int seq_len = input_ids.size(1); - th::Tensor output_logits = torch::empty({batch_size, seq_len, (long)vocab_size_}, - torch::dtype(torch::kFloat32).device(torch::kCUDA).requires_grad(false)); - ftllama->forward(output_logits, input_ids, input_lengths, (int)start_pos); + const int batch_size = input_ids.size(0); + const int seq_len = input_ids.size(1); + ftllama->forward(output_logits, input_ids, input_lengths, start_pos, num_tokens, max_length); return output_logits; } diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 597279b92..237728c1d 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -29,8 +29,12 @@ using std::vector; class IFLLaMA { public: virtual ~IFLLaMA() {} - virtual void - forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int start_pos) = 0; + virtual void forward(th::Tensor& output_logits, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& 
start_pos, + const int num_tokens, + const int max_length) = 0; }; template @@ -167,8 +171,12 @@ class FTLLaMA: public IFLLaMA { delete cublas_wrapper_mutex_; } - virtual void - forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, const int start_pos) override + virtual void forward(th::Tensor& output_logits, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& start_pos, + const int num_tokens, + const int max_length) override { const size_t batch_size = (size_t)input_ids.size(0); @@ -180,7 +188,10 @@ class FTLLaMA: public IFLLaMA { ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size, seq_len}, get_ptr(input_ids)}}, {"input_lengths", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(input_lengths)}}, - {"start_pos", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &start_pos}}}; + {"start_pos", + ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(start_pos)}}, + {"num_tokens", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &num_tokens}}, + {"max_length", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &max_length}}}; std::unordered_map output_tensors = std::unordered_map{ {"output_logits", @@ -220,7 +231,7 @@ class FTLLaMA: public IFLLaMA { int64_t pipeline_para_size_; cudaStream_t stream_; - cudaEvent_t event_; + cudaEvent_t event_; std::vector weights_; cublasLtHandle_t cublasltHandle_; @@ -253,7 +264,12 @@ class LLaMA: public th::jit::CustomClassHolder { ~LLaMA(); - th::Tensor forward(th::Tensor& input_ids, th::Tensor& input_lengths, const int64_t start_pos); + th::Tensor forward(th::Tensor& output_logits, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& start_pos, + const int64_t num_tokens, + const int64_t max_length); private: const at::ScalarType st_; diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h index 962a47764..deed71f2c 100644 --- a/src/fastertransformer/utils/llama_utils.h +++ b/src/fastertransformer/utils/llama_utils.h @@ -131,15 +131,21 @@ static void print_tensor1(T* in, int dim1) } template -static void print_tensor2(T* in, int dim1, int dim2) +static void print_tensor2(T* in, int dim1, int dim2, int stride1, int size, int start) { - T* out = (T*)malloc(sizeof(T) * dim1 * dim2); - cudaMemcpy(out, in, sizeof(T) * dim1 * dim2, cudaMemcpyDeviceToHost); - _print_tensor2(out, dim1, dim2, dim2, 1); + T* out = (T*)malloc(sizeof(T) * size); + cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); + _print_tensor2(&out[start], dim1, dim2, stride1, 1); std::cout << "\n"; free(out); } +template +static void print_tensor2(T* in, int dim1, int dim2) +{ + print_tensor2(in, dim1, dim2, dim2, dim1 * dim2, 0); +} + template static void print_tensor3(T* in, int dim1, int dim2, int dim3, int stride1, int stride2, int size, int start) { From 083f3bb74d6535edf794b97be1d9475d09e3f748 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 29 Sep 2023 00:50:46 +0000 Subject: [PATCH 39/55] get back start_pos --- .../kernels/llama_kernels.cu | 1 - .../kernels/unfused_attention_kernels.cu | 14 ++++----- .../LLaMAContextAttentionLayer.cc | 29 ++++++++++--------- src/fastertransformer/models/llama/LLaMA.cc | 22 +++++++------- .../models/llama/LLaMAContextDecoder.cc | 17 ++++++----- src/fastertransformer/th_op/llama/LLaMA.cc | 4 +-- src/fastertransformer/th_op/llama/LLaMA.h | 10 +++---- 7 files changed, 49 insertions(+), 48 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu 
b/src/fastertransformer/kernels/llama_kernels.cu index 5379eda1d..360adeab3 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -53,7 +53,6 @@ __global__ void LLaMAbuildDecoderAttentionMaskKernel(T* attention_mask, attention_mask += batch_idx * mask_size_per_seq; const int context_length = context_lengths[batch_idx]; const int length = sequence_lengths[batch_idx]; - const int offset = max_length - length; for (int i = threadIdx.x; i < mask_size_per_seq; i += blockDim.x) { int row_id = i / max_length; diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 134d63921..3259b66df 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1592,7 +1592,7 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int* start_pos) + const int* context_lengths) { constexpr int vec_size = Vec_t::size; using Vec_t = typename Vec_t::Type; @@ -1622,7 +1622,7 @@ __global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, v = *reinterpret_cast(&QKV[src_v_idx]); } - mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, start_pos[batch_idx] + seq_idx); + mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, context_lengths[batch_idx] + seq_idx); const int dest_q_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + seq_idx * size_per_head + tidx * vec_size; @@ -1649,7 +1649,7 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int* start_pos, + const int* context_lengths, cudaStream_t stream) { dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); @@ -1664,7 +1664,7 @@ void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, head_num, size_per_head, rotary_embedding_dim, - start_pos); + context_lengths); } template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, @@ -1678,7 +1678,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int* start_pos, + const int* context_lengths, cudaStream_t stream); template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, @@ -1692,7 +1692,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int* start_pos, + const int* context_lengths, cudaStream_t stream); #ifdef ENABLE_BF16 template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, @@ -1706,7 +1706,7 @@ template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, const int head_num, const int size_per_head, const int rotary_embedding_dim, - const int* start_pos, + const int* context_lengths, cudaStream_t stream); #endif diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index bdf745562..f9d6e2838 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -33,7 +33,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // attention_mask [batch_size, 1, seq_len, max_length] // 
attention_type [1] // layer_id [1], int on cpu - // start_pos, int, [batch_size] + // context_lengths, int, [batch_size] // max_length, int, [batch_size] on cpu // padding_offset, int, [token_num] (optional) // cu_seqlens, int, [batch_size] (optional) @@ -42,17 +42,18 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // hidden_features [token_num, hidden_dimension] // key_cache [batch, local_head_num, max_seq_len, size_per_head] // value_cache [batch, local_head_num, max_seq_len, size_per_head] + FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); FT_CHECK(output_tensors->at("key_cache").shape.size() == 4); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); - const int batch_size = input_tensors->at("attention_mask").shape[0]; - const int seq_len = input_tensors->at("attention_mask").shape[2]; - const int max_seq_len = (int)(output_tensors->at("key_cache").shape[2]); - const int layer_id = input_tensors->getVal("layer_id"); - const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); - const int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); - const int* start_pos = input_tensors->at("start_pos").getPtr(); - const int max_length = input_tensors->at("max_length").getVal(); + const int batch_size = input_tensors->at("attention_mask").shape[0]; + const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int max_seq_len = (int)(output_tensors->at("key_cache").shape[2]); + const int layer_id = input_tensors->getVal("layer_id"); + const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); + const int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int max_length = input_tensors->at("max_length").getVal(); T* attention_out = output_tensors->at("hidden_features").getPtr(); T* attention_input = input_tensors->at("input_query").getPtr(); @@ -100,7 +101,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten head_num_, size_per_head_, rotary_embedding_dim_, - start_pos, + context_lengths, stream_); sync_check_cuda_error(); @@ -121,7 +122,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten max_seq_len, size_per_head_, head_num_, - start_pos, + context_lengths, stream_); sync_check_cuda_error(); POP_RANGE; @@ -231,9 +232,9 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten sync_check_cuda_error(); POP_RANGE; } - //std::cout << layer_id << "===============\n"; - //print_tensor4(qk_buf_, batch_size, head_num_, attention_seq_len_1, attention_seq_len_2); - //std::cout << layer_id << "===============\n"; + // std::cout << layer_id << "===============\n"; + // print_tensor4(qk_buf_, batch_size, head_num_, attention_seq_len_1, attention_seq_len_2); + // std::cout << layer_id << "===============\n"; PUSH_RANGE("QK*V batch gemm"); cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 52f8c76f0..2a04732a5 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -194,7 +194,7 @@ void LLaMA::forward(std::unordered_map* output_ten // input_tensors: // input_ids [batch_size, seq_len] // input_lengths [batch_size] - // start_pos [batch_size] + // context_lengths [batch_size] // num_tokens [1] int on cpu // max_length [1] int on cpu @@ -205,20 +205,20 @@ void LLaMA::forward(std::unordered_map* output_ten 
FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); - const DataType data_type = getTensorType(); - const size_t batch_size = input_tensors->at("input_ids").shape[0]; - const int seq_len = input_tensors->at("input_ids").shape[1]; - const int* input_ids = input_tensors->at("input_ids").getPtr(); - const int* start_pos = input_tensors->at("start_pos").getPtr(); - const int* input_lengths = input_tensors->at("input_lengths").getPtr(); - const int num_tokens = input_tensors->at("num_tokens").getVal(0); - const int max_length = input_tensors->at("max_length").getVal(0); + const DataType data_type = getTensorType(); + const size_t batch_size = input_tensors->at("input_ids").shape[0]; + const int seq_len = input_tensors->at("input_ids").shape[1]; + const int* input_ids = input_tensors->at("input_ids").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int num_tokens = input_tensors->at("num_tokens").getVal(0); + const int max_length = input_tensors->at("max_length").getVal(0); allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); invokeLLaMABuildDecoderAttentionMask( - input_attention_mask_, input_lengths, start_pos, batch_size, seq_len, max_length, stream_); + input_attention_mask_, input_lengths, context_lengths, batch_size, seq_len, max_length, stream_); sync_check_cuda_error(); if (pipeline_para_.rank_ == 0) { @@ -243,7 +243,7 @@ void LLaMA::forward(std::unordered_map* output_ten {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(max_length)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, - {"start_pos", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, start_pos}}, + {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, context_lengths}}, {"num_tokens", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &num_tokens}}, {"max_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_length}}}; diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 1393767ce..7946812b9 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -179,7 +179,9 @@ void LLaMAContextDecoder::forward(std::vector* std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, {"attention_mask", input_tensors->at(1)}, {"input_lengths", input_tensors->at(2)}, - {"start_pos", input_tensors->at(3)}}; + {"context_lengths", input_tensors->at(3)}, + {"num_tokens", input_tensors->at(4)}, + {"max_length", input_tensors->at(5)}}; std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, {"key_cache", output_tensors->at(1)}, {"value_cache", output_tensors->at(2)}}; @@ -196,7 +198,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* // decoder_input [batch_size, seq_len, hidden_dimension], // attention_mask [batch_size, 1, seq_len, seq_len] // input_lengths [batch_size] - // start_pos [batch_size] + // context_lengths [batch_size] // num_tokens [1] int on cpu // max_length [1] int on cpu @@ -216,7 +218,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* const int batch_size = input_tensors->at("decoder_input").shape[0]; const int seq_len = input_tensors->at("decoder_input").shape[1]; const int* input_lengths = 
input_tensors->at("input_lengths").getPtr(); - const int* start_pos = input_tensors->at("start_pos").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); const int max_length = input_tensors->at("max_length").getVal(0); const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; @@ -288,7 +290,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, input_lengths}}, - {"start_pos", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, start_pos}}, + {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, context_lengths}}, {"max_length", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &max_length}}, }; @@ -313,10 +315,9 @@ void LLaMAContextDecoder::forward(std::unordered_map* &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); - //std::cout << l << "===============\n"; - //print_tensor2(self_attn_output_, h_token_num, hidden_units_); - //std::cout << l << "===============\n"; - + // std::cout << l << "===============\n"; + // print_tensor2(self_attn_output_, h_token_num, hidden_units_); + // std::cout << l << "===============\n"; invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index e2b819c4b..8aada6aff 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -76,7 +76,7 @@ LLaMA::~LLaMA() th::Tensor LLaMA::forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, - th::Tensor& start_pos, + th::Tensor& context_lengths, const int64_t num_tokens, const int64_t max_length) { @@ -89,7 +89,7 @@ th::Tensor LLaMA::forward(th::Tensor& output_logits, const int batch_size = input_ids.size(0); const int seq_len = input_ids.size(1); - ftllama->forward(output_logits, input_ids, input_lengths, start_pos, num_tokens, max_length); + ftllama->forward(output_logits, input_ids, input_lengths, context_lengths, num_tokens, max_length); return output_logits; } diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 237728c1d..365a07dd1 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -32,7 +32,7 @@ class IFLLaMA { virtual void forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, - th::Tensor& start_pos, + th::Tensor& context_lengths, const int num_tokens, const int max_length) = 0; }; @@ -174,7 +174,7 @@ class FTLLaMA: public IFLLaMA { virtual void forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, - th::Tensor& start_pos, + th::Tensor& context_lengths, const int num_tokens, const int max_length) override { @@ -188,8 +188,8 @@ class FTLLaMA: public IFLLaMA { ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size, seq_len}, get_ptr(input_ids)}}, {"input_lengths", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(input_lengths)}}, - {"start_pos", - ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(start_pos)}}, + {"context_lengths", + ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(context_lengths)}}, {"num_tokens", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, 
std::vector{1}, &num_tokens}}, {"max_length", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &max_length}}}; @@ -267,7 +267,7 @@ class LLaMA: public th::jit::CustomClassHolder { th::Tensor forward(th::Tensor& output_logits, th::Tensor& input_ids, th::Tensor& input_lengths, - th::Tensor& start_pos, + th::Tensor& context_lengths, const int64_t num_tokens, const int64_t max_length); From f08ada9154c1203ff51590424a1d7ace9d08924d Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 29 Sep 2023 13:19:53 +0000 Subject: [PATCH 40/55] debug --- .../kernels/unfused_attention_kernels.cu | 10 +++++----- .../LLaMAContextAttentionLayer.cc | 17 ++--------------- src/fastertransformer/models/llama/LLaMA.cc | 1 + .../models/llama/LLaMAContextDecoder.cc | 4 ---- 4 files changed, 8 insertions(+), 24 deletions(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 3259b66df..c0d673c2a 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1906,7 +1906,7 @@ __global__ void transpose_4d_save_to_cache(T* k_dst, const int size_per_head, const int seq_len, const int max_seq_len, - const int* start_pos) + const int* context_lengths) { // [batch_size, head_num, seq_len, size_per_head] const int batch_id = blockIdx.y; @@ -1917,12 +1917,12 @@ __global__ void transpose_4d_save_to_cache(T* k_dst, + head_id * size_per_head * seq_len); auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * max_seq_len - + head_id * size_per_head * max_seq_len + start_pos[batch_id] * size_per_head); + + head_id * size_per_head * max_seq_len + context_lengths[batch_id] * size_per_head); auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * seq_len + head_id * size_per_head * seq_len); auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_seq_len - + head_id * size_per_head * max_seq_len + start_pos[batch_id] * size_per_head); + + head_id * size_per_head * max_seq_len + context_lengths[batch_id] * size_per_head); // idx is over output dimension L * size_per_head / x for values const int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -1948,7 +1948,7 @@ void invokeLLaMASaveToCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int* start_pos, + const int* context_lengths, cudaStream_t stream) { constexpr int block_sz = 128; @@ -1956,7 +1956,7 @@ void invokeLLaMASaveToCache(T* k_dst, dim3 grid((seq_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); transpose_4d_save_to_cache<<>>( - k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, start_pos); + k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, context_lengths); } #define INSTANTIATESAVETOCACHE(T) \ diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index f9d6e2838..5557454a5 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -42,7 +42,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // hidden_features [token_num, hidden_dimension] // key_cache [batch, local_head_num, max_seq_len, size_per_head] // value_cache [batch, local_head_num, max_seq_len, 
size_per_head] - + FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); FT_CHECK(output_tensors->at("key_cache").shape.size() == 4); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); @@ -105,10 +105,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten stream_); sync_check_cuda_error(); - // std::cout << layer_id << "===============\n"; - // print_tensor4(k_buf_2_, batch_size, head_num_, seq_len, size_per_head_); - // std::cout << layer_id << "===============\n"; - // key_cache [batch, local_head_num, max_seq_len, size_per_head] // value_cache [batch, local_head_num, max_seq_len, size_per_head] T* key_cache = output_tensors->getPtr("key_cache"); @@ -139,10 +135,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten max_length, stream_); - // std::cout << layer_id << "===============\n"; - // print_tensor4(k_buf_2_, batch_size, head_num_, max_length, size_per_head_); - // std::cout << layer_id << "===============\n"; - if (attention_type == AttentionType::FUSED_MHA) { dispatcher_fp16->setup_causal_masked_fmha(seq_len, batch_size); dispatcher_fp16->run_causal_masked_fmha(qkv_buf_, cu_seqlens, qkv_buf_3_, true, stream_); @@ -183,6 +175,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten sync_check_cuda_error(); POP_RANGE; + PUSH_RANGE("softmax"); MaskedSoftmaxParam param; param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) @@ -232,9 +225,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten sync_check_cuda_error(); POP_RANGE; } - // std::cout << layer_id << "===============\n"; - // print_tensor4(qk_buf_, batch_size, head_num_, attention_seq_len_1, attention_seq_len_2); - // std::cout << layer_id << "===============\n"; PUSH_RANGE("QK*V batch gemm"); cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, @@ -257,9 +247,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten batch_size * head_num_); sync_check_cuda_error(); - // std::cout << layer_id << "===============\n"; - // print_tensor4(qkv_buf_2_, batch_size, head_num_, attention_seq_len_1, size_per_head_); - // std::cout << layer_id << "===============\n"; // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) if (padding_offset == nullptr) { diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 2a04732a5..95282ba1c 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -265,6 +265,7 @@ void LLaMA::forward(std::unordered_map* output_ten &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); + if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { ftNcclSend(context_decoder_output_buf_, batch_size * seq_len * hidden_units_, diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 7946812b9..9eaa81758 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -315,10 +315,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* &self_attention_input_tensors, &llama_decoder_layer_weight->at(l)->self_attention_weights); - // std::cout << l << "===============\n"; - // print_tensor2(self_attn_output_, h_token_num, hidden_units_); - // std::cout << l << "===============\n"; - invokeGeneralLLaMAAddBiasResidualPreLayerNorm( self_attn_output_, decoder_normed_input_, From 
e5d92df821f27e090e1bb8019e9bdd3b89f499ff Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 29 Sep 2023 19:58:24 +0000 Subject: [PATCH 41/55] chkpt --- .../kernels/llama_kernels.cu | 65 ++++++++-- src/fastertransformer/kernels/llama_kernels.h | 7 ++ .../kernels/unfused_attention_kernels.cu | 22 ++-- .../kernels/unfused_attention_kernels.h | 2 +- .../LLaMAContextAttentionLayer.cc | 31 +++-- src/fastertransformer/models/llama/LLaMA.cc | 118 ++++++++++++------ src/fastertransformer/models/llama/LLaMA.h | 17 ++- .../models/llama/LLaMAContextDecoder.cc | 60 ++++----- src/fastertransformer/th_op/llama/LLaMA.cc | 8 +- src/fastertransformer/th_op/llama/LLaMA.h | 39 +++--- 10 files changed, 240 insertions(+), 129 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index 360adeab3..95700cb1f 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -1,12 +1,59 @@ #include "src/fastertransformer/kernels/llama_kernels.h" #include "src/fastertransformer/utils/cuda_fp8_utils.h" +#include + #include #include #include +using namespace std; namespace fastertransformer { +template +__global__ void LLaMAstart_id_embedding_lookups_kernel( + T* out, const T* embedding_table, const int* input_ids, const int num_tokens, const int64_t hidden_units) +{ + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_tokens * hidden_units; + index += blockDim.x * gridDim.x) { + + // embedding lookup from word ids [batch, length] (part of [batch, length]) and [vocab, hidden] to generate + // embedding [batch, length, hidden] + const int word_index = index / hidden_units; + const int col_index = index % hidden_units; + const int input_id = input_ids[word_index]; + + out[index] = embedding_table[input_id * hidden_units + col_index]; + } +} + +template +void invokeLLaMAInputIdsEmbeddingLookup(T* out, + const T* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream) +{ + dim3 grid(min(num_tokens, 65536)); + dim3 block(min(hidden_units, 512)); + LLaMAstart_id_embedding_lookups_kernel + <<>>(out, embedding_table, input_ids, num_tokens, hidden_units); +} + +template void invokeLLaMAInputIdsEmbeddingLookup(float* out, + const float* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); +template void invokeLLaMAInputIdsEmbeddingLookup(half* out, + const half* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); + __global__ void LLaMAgetPaddingOffsetAndCuSeqLensKernel( int* padding_offset, int* cu_seqlens, const int* sequence_length, const int batch_size, const int seq_len) { @@ -44,19 +91,19 @@ __global__ void LLaMAbuildDecoderAttentionMaskKernel(T* attention_mask, const int* context_lengths, const int batch_size, const int seq_len, - const int max_length) + const int attn_len) { // attention_mask: - // [batch_size, 1, seq_len, max_length] + // [batch_size, 1, seq_len, attn_len] const int batch_idx = blockIdx.x; - const int mask_size_per_seq = seq_len * max_length; + const int mask_size_per_seq = seq_len * attn_len; attention_mask += batch_idx * mask_size_per_seq; const int context_length = context_lengths[batch_idx]; const int length = sequence_lengths[batch_idx]; for (int i = threadIdx.x; i < mask_size_per_seq; i += blockDim.x) { - int row_id = i / max_length; - int col_id = i % max_length; + int row_id = i / 
attn_len; + int col_id = i % attn_len; if (row_id < length && col_id <= (row_id + context_length)) { attention_mask[i] = (T)(1.0f); } @@ -72,11 +119,11 @@ void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, const int* context_lengths, const int batch_size, const int seq_len, - const int max_length, + const int attn_len, cudaStream_t stream) { LLaMAbuildDecoderAttentionMaskKernel<<>>( - attention_mask, sequence_length, context_lengths, batch_size, seq_len, max_length); + attention_mask, sequence_length, context_lengths, batch_size, seq_len, attn_len); } template void invokeLLaMABuildDecoderAttentionMask(float* attention_mask, @@ -84,7 +131,7 @@ template void invokeLLaMABuildDecoderAttentionMask(float* attention_mask, const int* context_lengths, const int batch_size, const int seq_len, - const int max_length, + const int attn_len, cudaStream_t stream); template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, @@ -92,7 +139,7 @@ template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, const int* context_lengths, const int batch_size, const int seq_len, - const int max_length, + const int attn_len, cudaStream_t stream); template diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h index a218b40d1..2d1c9592e 100644 --- a/src/fastertransformer/kernels/llama_kernels.h +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -19,6 +19,13 @@ void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, const int seq_len, const int max_length, cudaStream_t stream); +template +void invokeLLaMAInputIdsEmbeddingLookup(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); template void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream); diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index c0d673c2a..2f867186e 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1987,21 +1987,21 @@ __global__ void transpose_4d_load_from_cache(T* k_dst, const int size_per_head, const int seq_len, const int max_seq_len, - const int max_length) + const int attn_len) { - // [batch_size, head_num, max_length, size_per_head] + // [batch_size, head_num, attn_len, size_per_head] const int batch_id = blockIdx.y; const int head_id = blockIdx.z; // 16 byte loads will handle "x" dimension auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * max_seq_len + head_id * size_per_head * max_seq_len); - auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * max_length - + head_id * size_per_head * max_length); + auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * attn_len + + head_id * size_per_head * attn_len); auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * max_seq_len + head_id * size_per_head * max_seq_len); - auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_length - + head_id * size_per_head * max_length); + auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * attn_len + + head_id * size_per_head * attn_len); // idx is over output dimension L * size_per_head / x for values const int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -2009,7 +2009,7 @@ __global__ void transpose_4d_load_from_cache(T* k_dst, 
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; const int size_per_head_div_x = size_per_head / X_ELEMS; - if (idx >= size_per_head_div_x * max_length) { + if (idx >= size_per_head_div_x * attn_len) { return; } @@ -2027,15 +2027,15 @@ void invokeLLaMALoadFromCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int max_length, + const int attn_len, cudaStream_t stream) { constexpr int block_sz = 128; constexpr int x = (sizeof(T) == 4) ? 4 : 8; - dim3 grid((max_length * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); + dim3 grid((attn_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); transpose_4d_load_from_cache<<>>( - k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, max_length); + k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, attn_len); } #define INSTANTIATELOADFROMCACHE(T) \ @@ -2048,7 +2048,7 @@ void invokeLLaMALoadFromCache(T* k_dst, const int max_seq_len, \ const int size_per_head, \ const int local_head_num, \ - const int max_length, \ + const int attn_len, \ cudaStream_t stream) INSTANTIATELOADFROMCACHE(float); INSTANTIATELOADFROMCACHE(half); diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index c1d85816f..52fa0f053 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -226,7 +226,7 @@ void invokeLLaMALoadFromCache(T* k_dst, const int max_seq_len, const int size_per_head, const int local_head_num, - const int max_length, + const int attn_len, cudaStream_t stream); template diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 5557454a5..0c2307fc8 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -29,17 +29,17 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten const AttentionWeight* attention_weights) { // input_tensors: - // input_query [token_num, hidden_dimension] - // attention_mask [batch_size, 1, seq_len, max_length] + // input_query [num_tokens, hidden_dimension] + // attention_mask [batch_size, 1, seq_len, attn_len] // attention_type [1] // layer_id [1], int on cpu // context_lengths, int, [batch_size] - // max_length, int, [batch_size] on cpu - // padding_offset, int, [token_num] (optional) + // attn_len, int, [batch_size] on cpu + // padding_offset, int, [num_tokens] (optional) // cu_seqlens, int, [batch_size] (optional) // output_tensors: - // hidden_features [token_num, hidden_dimension] + // hidden_features [num_tokens, hidden_dimension] // key_cache [batch, local_head_num, max_seq_len, size_per_head] // value_cache [batch, local_head_num, max_seq_len, size_per_head] @@ -47,13 +47,13 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten FT_CHECK(output_tensors->at("key_cache").shape.size() == 4); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); const int batch_size = input_tensors->at("attention_mask").shape[0]; - const int seq_len = input_tensors->at("attention_mask").shape[2]; const int max_seq_len = (int)(output_tensors->at("key_cache").shape[2]); const int layer_id = input_tensors->getVal("layer_id"); const int* padding_offset = 
input_tensors->getPtr("padding_offset", nullptr); const int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); const int* context_lengths = input_tensors->at("context_lengths").getPtr(); - const int max_length = input_tensors->at("max_length").getVal(); + const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int attn_len = input_tensors->at("attention_mask").shape[3]; T* attention_out = output_tensors->at("hidden_features").getPtr(); T* attention_input = input_tensors->at("input_query").getPtr(); @@ -68,14 +68,14 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten POP_RANGE; sync_check_cuda_error(); - const int m = input_tensors->at("input_query").shape[0]; + const int num_tokens = input_tensors->at("input_query").shape[0]; PUSH_RANGE("qkv_gemm"); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, 3 * hidden_units_, // n - m, + num_tokens, hidden_units_, // k attention_weights->query_weight.kernel, 3 * hidden_units_, // n @@ -97,7 +97,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten padding_offset, batch_size, seq_len, - m, + num_tokens, head_num_, size_per_head_, rotary_embedding_dim_, @@ -132,7 +132,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten max_seq_len, size_per_head_, head_num_, - max_length, + attn_len, stream_); if (attention_type == AttentionType::FUSED_MHA) { @@ -141,8 +141,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten } else { const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = seq_len; // q length - const int attention_seq_len_2 = max_length; // kv length + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = attn_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); // @@ -175,7 +175,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten sync_check_cuda_error(); POP_RANGE; - PUSH_RANGE("softmax"); MaskedSoftmaxParam param; param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) @@ -264,7 +263,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten else { invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, qkv_buf_3_, - m, + num_tokens, batch_size, attention_seq_len_1, head_num_, @@ -283,7 +282,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, hidden_units_, - m, + num_tokens, hidden_units_, attention_weights->attention_output_weight.kernel, hidden_units_, diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 95282ba1c..fb71c966f 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -29,6 +29,11 @@ namespace fastertransformer { template void LLaMA::initialize() { + check_cuda_error(cudaStreamCreateWithFlags(&comm_stream_, cudaStreamNonBlocking)); + for (int i = 0; i < num_buffers_; ++i) { + check_cuda_error(cudaEventCreate(&kern_event_[i])); + check_cuda_error(cudaEventCreate(&comm_event_[i])); + } llama_context_decoder_ = new LLaMAContextDecoder(head_num_, size_per_head_, inter_size_, @@ -67,6 +72,11 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ context_decoder_output_buf_ = (T*)(allocator_->reMalloc( context_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + for (int i = 0; i < num_buffers_; ++i) { + context_decoder_output_buf_clone_[i] = (T*)(allocator_->reMalloc( + 
context_decoder_output_buf_clone_[i], sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); + } + normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); @@ -82,6 +92,9 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&key_cache_)); allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); + for (int i = 0; i < num_buffers_; ++i) { + allocator_->free((void**)(&context_decoder_output_buf_clone_[i])); + } allocator_->free((void**)(&logits_buf_)); is_allocate_buffer_ = false; } @@ -173,6 +186,12 @@ LLaMA::LLaMA(LLaMA const& llama): template LLaMA::~LLaMA() { + check_cuda_error(cudaStreamDestroy(comm_stream_)); + for (int i = 0; i < num_buffers_; ++i) { + check_cuda_error(cudaEventDestroy(kern_event_[i])); + check_cuda_error(cudaEventDestroy(comm_event_[i])); + } + delete llama_context_decoder_; freeBuffer(); } @@ -192,64 +211,66 @@ void LLaMA::forward(std::unordered_map* output_ten { // // input_tensors: - // input_ids [batch_size, seq_len] + // input_ids [num_tokens] // input_lengths [batch_size] // context_lengths [batch_size] // num_tokens [1] int on cpu - // max_length [1] int on cpu + // seq_len [1] int on cpu + // attn_len [1] int on cpu // output_tensors: - // output_logits [batch_size, seq_len, vocab_size] + // output_logits [num_tokens, vocab_size] - FT_CHECK_WITH_INFO(input_tensors->size() == 5, "input_tensors->size() == 5"); + FT_CHECK_WITH_INFO(input_tensors->size() == 6, "input_tensors->size() == 6"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); const DataType data_type = getTensorType(); - const size_t batch_size = input_tensors->at("input_ids").shape[0]; - const int seq_len = input_tensors->at("input_ids").shape[1]; + const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); + const size_t batch_size = input_tensors->at("input_lengths").shape[0]; const int* input_ids = input_tensors->at("input_ids").getPtr(); const int* context_lengths = input_tensors->at("context_lengths").getPtr(); const int* input_lengths = input_tensors->at("input_lengths").getPtr(); - const int num_tokens = input_tensors->at("num_tokens").getVal(0); - const int max_length = input_tensors->at("max_length").getVal(0); + const int num_tokens = input_tensors->at("num_tokens").getVal(); + const int seq_len = input_tensors->at("seq_len").getVal(); + const int attn_len = input_tensors->at("attn_len").getVal(); allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); invokeLLaMABuildDecoderAttentionMask( - input_attention_mask_, input_lengths, context_lengths, batch_size, seq_len, max_length, stream_); + input_attention_mask_, input_lengths, context_lengths, batch_size, seq_len, attn_len, stream_); sync_check_cuda_error(); if (pipeline_para_.rank_ == 0) { - invokeInputIdsEmbeddingLookup(context_decoder_input_buf_, - llama_weights->pre_decoder_embedding_table, - input_ids, - seq_len, - batch_size, - hidden_units_, - stream_); + invokeLLaMAInputIdsEmbeddingLookup(context_decoder_input_buf_, + llama_weights->pre_decoder_embedding_table, + input_ids, + num_tokens, + hidden_units_, + stream_); sync_check_cuda_error(); } else { - int data_size = batch_size * seq_len * hidden_units_; - ftNcclRecv(context_decoder_input_buf_, data_size, pipeline_para_.rank_ - 1, 
pipeline_para_, stream_); + ftNcclRecv( + context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_); sync_check_cuda_error(); } std::unordered_map decoder_input_tensors{ {"decoder_input", - Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_input_buf_}}, + Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, context_decoder_input_buf_}}, {"attention_mask", - Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(max_length)}, input_attention_mask_}}, + Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, context_lengths}}, {"num_tokens", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &num_tokens}}, - {"max_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_length}}}; + {"seq_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &seq_len}}, + {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &attn_len}}}; std::unordered_map decoder_output_tensors{ {"decoder_output", - Tensor{MEMORY_GPU, data_type, {batch_size, (size_t)seq_len, hidden_units_}, context_decoder_output_buf_}}, + Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, context_decoder_output_buf_}}, {"key_cache", Tensor{MEMORY_GPU, data_type, @@ -265,13 +286,22 @@ void LLaMA::forward(std::unordered_map* output_ten &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { - ftNcclSend(context_decoder_output_buf_, - batch_size * seq_len * hidden_units_, + buf_no_ = (buf_no_ + 1) % num_buffers_; + check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); + invokeLLaMACopyKernel(context_decoder_output_buf_clone_[buf_no_], + context_decoder_output_buf_, + num_tokens * hidden_units_, + stream_); + sync_check_cuda_error(); + check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); + check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); + ftNcclSend(context_decoder_output_buf_clone_[buf_no_], + num_tokens * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, - stream_); + comm_stream_); + check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); sync_check_cuda_error(); } else { @@ -279,29 +309,47 @@ void LLaMA::forward(std::unordered_map* output_ten context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, layernorm_eps_, - batch_size * seq_len, + num_tokens, hidden_units_, stream_); sync_check_cuda_error(); + float alpha = 1.0f; + float beta = 0.0f; + float* output_logits = output_tensors->at("output_logits").getPtr(); + cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_32F, CUDA_R_32F); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, vocab_size_, - batch_size * seq_len, + num_tokens, hidden_units_, llama_weights->post_decoder_embedding.kernel, vocab_size_, normed_decoder_output_buf_, hidden_units_, // n - logits_buf_, + output_logits, vocab_size_); sync_check_cuda_error(); - - if (std::is_same::value) { - float* output_logits = output_tensors->at("output_logits").getPtr(); - invokeCudaCast(output_logits, logits_buf_, batch_size * seq_len * vocab_size_, stream_); - sync_check_cuda_error(); - } + cublas_wrapper_->setFP16GemmConfig(); + + // cublas_wrapper_->Gemm(CUBLAS_OP_N, + // CUBLAS_OP_N, + // vocab_size_, + // num_tokens, + // 
hidden_units_, + // llama_weights->post_decoder_embedding.kernel, + // vocab_size_, + // normed_decoder_output_buf_, + // hidden_units_, // n + // logits_buf_, + // vocab_size_); + // sync_check_cuda_error(); + // + // if (std::is_same::value) { + // float* output_logits = output_tensors->at("output_logits").getPtr(); + // invokeCudaCast(output_logits, logits_buf_, num_tokens * vocab_size_, stream_); + // sync_check_cuda_error(); + // } } } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 62af1a6d2..34e8c7ae9 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -38,6 +38,12 @@ class LLaMA: public BaseLayer { size_t random_seed_; size_t max_seq_len_; + static constexpr int num_buffers_ = 5; + int buf_no_ = 0; + cudaStream_t comm_stream_; + cudaEvent_t kern_event_[num_buffers_]; + cudaEvent_t comm_event_[num_buffers_]; + static constexpr float layernorm_eps_ = 1e-6f; size_t hidden_units_; @@ -59,16 +65,17 @@ class LLaMA: public BaseLayer { void initialize(); protected: - T* input_attention_mask_ = nullptr; - T* key_cache_ = nullptr; - T* value_cache_ = nullptr; + T* input_attention_mask_ = nullptr; + T* key_cache_ = nullptr; + T* value_cache_ = nullptr; T* decoder_output_buf_ = nullptr; T* normed_decoder_output_buf_ = nullptr; T* logits_buf_ = nullptr; - T* context_decoder_input_buf_ = nullptr; - T* context_decoder_output_buf_ = nullptr; + T* context_decoder_input_buf_ = nullptr; + T* context_decoder_output_buf_ = nullptr; + T* context_decoder_output_buf_clone_[num_buffers_] = {nullptr}; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 9eaa81758..0294c58d4 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -181,7 +181,8 @@ void LLaMAContextDecoder::forward(std::vector* {"input_lengths", input_tensors->at(2)}, {"context_lengths", input_tensors->at(3)}, {"num_tokens", input_tensors->at(4)}, - {"max_length", input_tensors->at(5)}}; + {"seq_len", input_tensors->at(5)}, + {"attn_len", input_tensors->at(6)}}; std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, {"key_cache", output_tensors->at(1)}, {"value_cache", output_tensors->at(2)}}; @@ -195,31 +196,29 @@ void LLaMAContextDecoder::forward(std::unordered_map* const std::vector*>* llama_decoder_layer_weight) { // input tensors: - // decoder_input [batch_size, seq_len, hidden_dimension], - // attention_mask [batch_size, 1, seq_len, seq_len] + // decoder_input [num_tokens, hidden_dimension], + // attention_mask [batch_size, 1, seq_len, attn_len] // input_lengths [batch_size] // context_lengths [batch_size] // num_tokens [1] int on cpu - // max_length [1] int on cpu + // seq_len [1] int on cpu + // attn_len [1] int on cpu // output tensors: - // decoder_output [batch_size, seq_len, hidden_dimension], + // decoder_output [num_tokens, hidden_dimension], // key_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] // value_cache [num_layer, batch, local_head_num, mxa_seq_len, size_per_head] - // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * batch_size'. - // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during - // computing. 
- - FT_CHECK(input_tensors->size() == 6); + FT_CHECK(input_tensors->size() == 7); FT_CHECK(output_tensors->size() == 3); const DataType data_type = getTensorType(); const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; + const int batch_size = input_tensors->at("input_lengths").shape[0]; const int* input_lengths = input_tensors->at("input_lengths").getPtr(); const int* context_lengths = input_tensors->at("context_lengths").getPtr(); - const int max_length = input_tensors->at("max_length").getVal(0); + const int num_tokens = input_tensors->at("num_tokens").getVal(); + const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int attn_len = input_tensors->at("attention_mask").shape[3]; const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; allocateBuffer(batch_size, seq_len, max_seq_len); @@ -248,10 +247,11 @@ void LLaMAContextDecoder::forward(std::unordered_map* padding_offset_, cu_seqlens_, input_lengths, batch_size, seq_len, stream_); sync_check_cuda_error(); - h_token_num = input_tensors->at("num_tokens").getVal(); + h_token_num = num_tokens; - invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, stream_); - sync_check_cuda_error(); + // invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, + // stream_); + // sync_check_cuda_error(); } for (int l = 0; l < num_layer_; l++) { @@ -262,14 +262,14 @@ void LLaMAContextDecoder::forward(std::unordered_map* const bool is_final = false; T* layer_input = decoder_layer_output_; T* layer_output = decoder_layer_output_; - if (!is_unpadded_mha) { - if (isFirstLayerParallelId(l)) { - layer_input = decoder_input; - } - if (isLastLayerParallelId(l)) { - layer_output = decoder_output; - } + // if (!is_unpadded_mha) { + if (isFirstLayerParallelId(l)) { + layer_input = decoder_input; + } + if (isLastLayerParallelId(l)) { + layer_output = decoder_output; } + // } invokeGeneralLLaMALayerNorm(decoder_normed_input_, layer_input, @@ -285,13 +285,13 @@ void LLaMAContextDecoder::forward(std::unordered_map* {"attention_mask", Tensor{MEMORY_GPU, data_type, - {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(max_length)}, + {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(attn_len)}, attention_mask}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, input_lengths}}, {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, context_lengths}}, - {"max_length", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &max_length}}, + {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &attn_len}}, }; if (is_unpadded_mha) { @@ -346,11 +346,11 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); } - if (is_unpadded_mha) { - invokeRebuildPadding( - decoder_output, decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); - sync_check_cuda_error(); - } + // if (is_unpadded_mha) { + // invokeRebuildPadding( + // decoder_output, decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); + // sync_check_cuda_error(); + // } if (is_free_buffer_after_forward_ == true) { freeBuffer(); diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc 
b/src/fastertransformer/th_op/llama/LLaMA.cc index 8aada6aff..1e260eec6 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -78,7 +78,8 @@ th::Tensor LLaMA::forward(th::Tensor& output_logits, th::Tensor& input_lengths, th::Tensor& context_lengths, const int64_t num_tokens, - const int64_t max_length) + const int64_t seq_len, + const int64_t attn_len) { CHECK_TH_CUDA(input_ids); CHECK_CONTIGUOUS(input_ids); @@ -87,9 +88,8 @@ th::Tensor LLaMA::forward(th::Tensor& output_logits, CHECK_CONTIGUOUS(input_lengths); TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); - const int batch_size = input_ids.size(0); - const int seq_len = input_ids.size(1); - ftllama->forward(output_logits, input_ids, input_lengths, context_lengths, num_tokens, max_length); + const int batch_size = input_lengths.size(0); + ftllama->forward(output_logits, input_ids, input_lengths, context_lengths, num_tokens, seq_len, attn_len); return output_logits; } diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 365a07dd1..83c580ce8 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -34,7 +34,8 @@ class IFLLaMA { th::Tensor& input_lengths, th::Tensor& context_lengths, const int num_tokens, - const int max_length) = 0; + const int seq_len, + const int attn_len) = 0; }; template @@ -115,8 +116,6 @@ class FTLLaMA: public IFLLaMA { cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); cublasSetStream(cublasHandle, stream_); - /// ft::Allocator allocator = - // ft::Allocator(at::cuda::getCurrentCUDAStream().device_index()); allocator_ = new ft::Allocator(); cublas_wrapper_ = new ft::cublasMMWrapper( cublasHandle, cublasltHandle_, stream_, cublas_algo_map_, cublas_wrapper_mutex_, allocator_); @@ -176,28 +175,31 @@ class FTLLaMA: public IFLLaMA { th::Tensor& input_lengths, th::Tensor& context_lengths, const int num_tokens, - const int max_length) override + const int seq_len, + const int attn_len) override { - - const size_t batch_size = (size_t)input_ids.size(0); - const size_t seq_len = (size_t)input_ids.size(1); + const size_t batch_size = (size_t)input_lengths.size(0); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", - ft::Tensor{ - ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size, seq_len}, get_ptr(input_ids)}}, + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{batch_size, (size_t)seq_len}, + get_ptr(input_ids)}}, {"input_lengths", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(input_lengths)}}, {"context_lengths", - ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(context_lengths)}}, + ft::Tensor{ + ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(context_lengths)}}, {"num_tokens", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &num_tokens}}, - {"max_length", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &max_length}}}; + {"seq_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &seq_len}}, + {"attn_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &attn_len}}}; std::unordered_map output_tensors = std::unordered_map{ {"output_logits", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, - std::vector{batch_size, seq_len, vocab_size_}, + std::vector{batch_size, (size_t)seq_len, vocab_size_}, get_ptr(output_logits)}}}; try { @@ -264,12 +266,13 @@ class LLaMA: public 
th::jit::CustomClassHolder { ~LLaMA(); - th::Tensor forward(th::Tensor& output_logits, - th::Tensor& input_ids, - th::Tensor& input_lengths, - th::Tensor& context_lengths, - const int64_t num_tokens, - const int64_t max_length); + th::Tensor forward(th::Tensor& output_logits, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& context_lengths, + const int64_t num_tokens, + const int64_t seq_len, + const int64_t attn_len); private: const at::ScalarType st_; From 433f2c95032544aaef28a75cff7feca631674902 Mon Sep 17 00:00:00 2001 From: dypshong Date: Fri, 29 Sep 2023 20:28:07 +0000 Subject: [PATCH 42/55] ckpt --- src/fastertransformer/models/llama/LLaMA.cc | 34 +++++++++++++++++++ src/fastertransformer/models/llama/LLaMA.h | 8 +++-- .../models/llama/LLaMAContextDecoder.cc | 33 ++++++------------ .../models/llama/LLaMAContextDecoder.h | 2 -- 4 files changed, 50 insertions(+), 27 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index fb71c966f..b9162c041 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -61,6 +61,10 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ FT_LOG_DEBUG(__PRETTY_FUNCTION__); const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batch_size * max_seq_len * hidden_units_; + padding_offset_ = + reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); + cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + input_attention_mask_ = (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_seq_len, false)); @@ -88,6 +92,8 @@ template void LLaMA::freeBuffer() { if (is_allocate_buffer_) { + allocator_->free((void**)(&padding_offset_)); + allocator_->free((void**)(&cu_seqlens_)); allocator_->free((void**)(&input_attention_mask_)); allocator_->free((void**)(&key_cache_)); allocator_->free((void**)(&context_decoder_input_buf_)); @@ -238,6 +244,16 @@ void LLaMA::forward(std::unordered_map* output_ten allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); + if (is_unpadded_mha) { + invokeLLaMAGetPaddingOffsetAndCuSeqLens( + padding_offset_, cu_seqlens_, input_lengths, batch_size, seq_len, stream_); + sync_check_cuda_error(); + + // invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, + // stream_); + // sync_check_cuda_error(); + } + invokeLLaMABuildDecoderAttentionMask( input_attention_mask_, input_lengths, context_lengths, batch_size, seq_len, attn_len, stream_); sync_check_cuda_error(); @@ -252,8 +268,10 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); } else { + ftNcclGroupStart(); ftNcclRecv( context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_); + ftNcclGroupEnd(); sync_check_cuda_error(); } @@ -268,6 +286,13 @@ void LLaMA::forward(std::unordered_map* output_ten {"seq_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &seq_len}}, {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &attn_len}}}; + if (is_unpadded_mha) { + decoder_input_tensors.insert( + {"padding_offset", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)num_tokens}, padding_offset_}}); + decoder_input_tensors.insert( + {"cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens_}}); + } + std::unordered_map decoder_output_tensors{ 
{"decoder_output", Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, context_decoder_output_buf_}}, @@ -286,6 +311,13 @@ void LLaMA::forward(std::unordered_map* output_ten &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); + + // if (is_unpadded_mha) { + // invokeRebuildPadding( + // decoder_output, decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); + // sync_check_cuda_error(); + // } + if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { buf_no_ = (buf_no_ + 1) % num_buffers_; check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); @@ -296,11 +328,13 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); + ftNcclGroupStart(); ftNcclSend(context_decoder_output_buf_clone_[buf_no_], num_tokens * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, comm_stream_); + ftNcclGroupEnd(); check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); sync_check_cuda_error(); } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 34e8c7ae9..35ff68fd6 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -65,9 +65,11 @@ class LLaMA: public BaseLayer { void initialize(); protected: - T* input_attention_mask_ = nullptr; - T* key_cache_ = nullptr; - T* value_cache_ = nullptr; + int* padding_offset_ = nullptr; + int* cu_seqlens_ = nullptr; + T* input_attention_mask_ = nullptr; + T* key_cache_ = nullptr; + T* value_cache_ = nullptr; T* decoder_output_buf_ = nullptr; T* normed_decoder_output_buf_ = nullptr; diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 0294c58d4..4a257a405 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -62,9 +62,6 @@ void LLaMAContextDecoder::allocateBuffer() template void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len) { - padding_offset_ = - reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); - cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -82,8 +79,6 @@ void LLaMAContextDecoder::freeBuffer() allocator_->free((void**)(&decoder_normed_input_)); allocator_->free((void**)(&self_attn_output_)); allocator_->free((void**)(&decoder_layer_output_)); - allocator_->free((void**)(&cu_seqlens_)); - allocator_->free((void**)(&padding_offset_)); is_allocate_buffer_ = false; } } @@ -203,13 +198,15 @@ void LLaMAContextDecoder::forward(std::unordered_map* // num_tokens [1] int on cpu // seq_len [1] int on cpu // attn_len [1] int on cpu + // padding_offset [batch_size] int on cpu + // cu_seqlens [batch_size+1] int on cpu // output tensors: // decoder_output [num_tokens, hidden_dimension], // key_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] // value_cache [num_layer, batch, local_head_num, mxa_seq_len, size_per_head] - FT_CHECK(input_tensors->size() == 7); + FT_CHECK(input_tensors->size() >= 7); 
FT_CHECK(output_tensors->size() == 3); const DataType data_type = getTensorType(); const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); @@ -219,6 +216,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* const int num_tokens = input_tensors->at("num_tokens").getVal(); const int seq_len = input_tensors->at("attention_mask").shape[2]; const int attn_len = input_tensors->at("attention_mask").shape[3]; + const int* padding_offset = nullptr; + const int* cu_seqlens = nullptr; + if (is_unpadded_mha) { + padding_offset = input_tensors->at("padding_offset").getPtr(); + cu_seqlens = input_tensors->at("cu_seqlens").getPtr(); + } const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; allocateBuffer(batch_size, seq_len, max_seq_len); @@ -243,15 +246,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* size_t h_token_num = batch_size * seq_len; if (is_unpadded_mha) { - invokeLLaMAGetPaddingOffsetAndCuSeqLens( - padding_offset_, cu_seqlens_, input_lengths, batch_size, seq_len, stream_); - sync_check_cuda_error(); - h_token_num = num_tokens; - - // invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, - // stream_); - // sync_check_cuda_error(); } for (int l = 0; l < num_layer_; l++) { @@ -296,9 +291,9 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (is_unpadded_mha) { self_attention_input_tensors.insert("padding_offset", - Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset_}); + Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset}); self_attention_input_tensors.insert("cu_seqlens", - Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens_}); + Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens}); } size_t cache_offset = l - getFirstLayerParallelId(); @@ -346,12 +341,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* sync_check_cuda_error(); } - // if (is_unpadded_mha) { - // invokeRebuildPadding( - // decoder_output, decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); - // sync_check_cuda_error(); - // } - if (is_free_buffer_after_forward_ == true) { freeBuffer(); } diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index d76ff0687..eb4e64ef0 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -67,8 +67,6 @@ class LLaMAContextDecoder: public BaseLayer { void initialize(); protected: - int* padding_offset_ = nullptr; - int* cu_seqlens_ = nullptr; T* decoder_normed_input_ = nullptr; T* self_attn_output_ = nullptr; T* decoder_layer_output_ = nullptr; From b63b4969db2059e3790c13cf87c5f4874230fc36 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 00:03:50 +0000 Subject: [PATCH 43/55] ckpt --- .../kernels/llama_kernels.cu | 107 +++++++++++++++++- src/fastertransformer/kernels/llama_kernels.h | 12 ++ src/fastertransformer/models/llama/LLaMA.cc | 60 +++++----- src/fastertransformer/models/llama/LLaMA.h | 7 +- src/fastertransformer/th_op/llama/LLaMA.cc | 21 ++-- src/fastertransformer/th_op/llama/LLaMA.h | 38 +++++-- 6 files changed, 187 insertions(+), 58 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index 95700cb1f..4110819a8 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -1,4 +1,5 @@ #include 
"src/fastertransformer/kernels/llama_kernels.h" +#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh" #include "src/fastertransformer/utils/cuda_fp8_utils.h" #include @@ -10,6 +11,111 @@ using namespace std; namespace fastertransformer { +__global__ void LLaMA_log_softmax(float* out, const float* logits, const int num_tokens, const int vocab_size) +{ + // logits [T, V] + // out [T, V] + const int64_t ti = blockIdx.x; + __shared__ float s_sum, s_max; + + if (ti >= num_tokens) + return; + + float local_max = -1e20f; + for (int i = threadIdx.x; i < vocab_size; i += blockDim.x) { + float logit_val = logits[ti * vocab_size + i]; + local_max = fmax(logit_val, local_max); + } + + float max_val = blockDim.x <= 32 ? warpReduceMax(local_max) : blockReduceMax(local_max); + if (threadIdx.x == 0) { + s_max = max_val; + } + __syncthreads(); + + float local_sum = 0; + for (int i = threadIdx.x; i < vocab_size; i += blockDim.x) { + float logit_val = logits[ti * vocab_size + i]; + local_sum += __expf(logit_val - s_max); + } + float sum_val = blockDim.x <= 32 ? warpReduceSum(local_sum) : blockReduceSum(local_sum); + if (threadIdx.x == 0) { + // s_sum = sum_val + 1e-6f; + s_sum = sum_val; + } + __syncthreads(); + + for (int i = threadIdx.x; i < vocab_size; i += blockDim.x) { + float logit_val = logits[ti * vocab_size + i]; + out[ti * vocab_size + i] = (logit_val - s_max) - __logf(s_sum); + } +} + +void invokeLLaMALogSoftmax( + float* out, const float* logits, const int num_tokens, const int vocab_size, cudaStream_t stream) +{ + dim3 grid(num_tokens); + dim3 block(min(1024, vocab_size)); + LLaMA_log_softmax<<>>(out, logits, num_tokens, vocab_size); +} + +__global__ void LLaMA_gather_tokens_kernel(float* out, + const float* probs, + const int* input_ids, + const int* input_lengths, + const int* cu_seqlens, + const int batch_size, + const int vocab_size) +{ + /* + // probs: [T, V] + // input_ids: [T] + int batch_idx = blockIdx.x; + + if (batch_idx >= batch_size) + return; + + float val = 0.f; + // for (int i = cu_seqlens[batch_idx] + threadIdx.x; i < cu_seqlens[batch_idx + 1] - 1; i += blockDim.x) { + // int input_idx = input_ids[i + 1]; + // val += probs[i * vocab_size + input_idx]; + // } + for (int t = cu_seqlens[batch_idx]; t < cu_seqlens[batch_idx + 1] - 1; ++t) { + val += probs[t * vocab_size + input_ids[t + 1]]; + } + //float sum = blockReduceSum(val); + + if (threadIdx.x == 0) + out[batch_idx] = val; + */ + // for b in range(bsz): + // for i in range(choice_seq_lens_list[c][b]-1): + // t = choice_cum_seq_lens_list[c][b] + i + // choice_log_probs[b, c] = choice_log_probs[b, c] + log_likelihoods[t, choice_tokens_list[c][t+1]] + + for (int b = 0; b < batch_size; ++b) { + float val = 0.f; + for (int i = 0; i < input_lengths[b] - 1; ++i) { + int t = cu_seqlens[b] + i; + val += probs[t * vocab_size + input_ids[t + 1]]; + } + out[b] = val; + } +} + +void invokeLLaMAGatherTokens(float* out, + const float* probs, + const int* input_ids, + const int* input_lengths, + const int* cu_seqlens, + const int batch_size, + const int vocab_size, + cudaStream_t stream) +{ + LLaMA_gather_tokens_kernel<<<1, 1, 0, stream>>>( + out, probs, input_ids, input_lengths, cu_seqlens, batch_size, vocab_size); +} + template __global__ void LLaMAstart_id_embedding_lookups_kernel( T* out, const T* embedding_table, const int* input_ids, const int num_tokens, const int64_t hidden_units) @@ -145,7 +251,6 @@ template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, template __global__ void LLaMACopyKernel(T* dst, T* 
src, const int count) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; if (idx * X_ELEMS >= count) { diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h index 2d1c9592e..66488462d 100644 --- a/src/fastertransformer/kernels/llama_kernels.h +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -29,4 +29,16 @@ void invokeLLaMAInputIdsEmbeddingLookup(T* from_tensor, template void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream); + +void invokeLLaMAGatherTokens(float* out, + const float* probs, + const int* input_ids, + const int* input_lengths, + const int* cu_seqlens, + const int batch_size, + const int vocab_size, + cudaStream_t stream); + +void invokeLLaMALogSoftmax( + float* out, const float* logits, const int num_tokens, const int vocab_size, cudaStream_t stream); } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index b9162c041..854df9de6 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -83,7 +83,10 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); - logits_buf_ = (T*)(allocator_->reMalloc(logits_buf_, sizeof(T) * batch_size * seq_len * vocab_size_, false)); + logits_buf_ = + (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * seq_len * vocab_size_, false)); + log_likelihood_buf_ = + (float*)(allocator_->reMalloc(log_likelihood_buf_, sizeof(float) * batch_size * seq_len * vocab_size_, false)); is_allocate_buffer_ = true; } @@ -225,7 +228,9 @@ void LLaMA::forward(std::unordered_map* output_ten // attn_len [1] int on cpu // output_tensors: - // output_logits [num_tokens, vocab_size] + // hidden_vector [num_tokens, hidden_size] + // log_probs [num_tokens, vocab_size] + // out_log_probs [batch_size] FT_CHECK_WITH_INFO(input_tensors->size() == 6, "input_tensors->size() == 6"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); @@ -240,6 +245,9 @@ void LLaMA::forward(std::unordered_map* output_ten const int num_tokens = input_tensors->at("num_tokens").getVal(); const int seq_len = input_tensors->at("seq_len").getVal(); const int attn_len = input_tensors->at("attn_len").getVal(); + T* hidden_vector = output_tensors->at("hidden_vector").getPtr(); + float* log_probs = output_tensors->at("log_probs").getPtr(); + float* out_log_probs = output_tensors->at("out_log_probs").getPtr(); allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); @@ -277,7 +285,10 @@ void LLaMA::forward(std::unordered_map* output_ten std::unordered_map decoder_input_tensors{ {"decoder_input", - Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, context_decoder_input_buf_}}, + Tensor{MEMORY_GPU, + data_type, + {(size_t)num_tokens, hidden_units_}, + (pipeline_para_.rank_ == 0) ? 
context_decoder_input_buf_ : hidden_vector}}, {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, @@ -294,8 +305,7 @@ void LLaMA::forward(std::unordered_map* output_ten } std::unordered_map decoder_output_tensors{ - {"decoder_output", - Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, context_decoder_output_buf_}}, + {"decoder_output", Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, hidden_vector}}, {"key_cache", Tensor{MEMORY_GPU, data_type, @@ -311,7 +321,6 @@ void LLaMA::forward(std::unordered_map* output_ten &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - // if (is_unpadded_mha) { // invokeRebuildPadding( // decoder_output, decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); @@ -321,10 +330,8 @@ void LLaMA::forward(std::unordered_map* output_ten if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { buf_no_ = (buf_no_ + 1) % num_buffers_; check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); - invokeLLaMACopyKernel(context_decoder_output_buf_clone_[buf_no_], - context_decoder_output_buf_, - num_tokens * hidden_units_, - stream_); + invokeLLaMACopyKernel( + context_decoder_output_buf_clone_[buf_no_], hidden_vector, num_tokens * hidden_units_, stream_); sync_check_cuda_error(); check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); @@ -340,7 +347,7 @@ void LLaMA::forward(std::unordered_map* output_ten } else { invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, - context_decoder_output_buf_, + hidden_vector, llama_weights->post_decoder_layernorm.gamma, layernorm_eps_, num_tokens, @@ -348,9 +355,8 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); - float alpha = 1.0f; - float beta = 0.0f; - float* output_logits = output_tensors->at("output_logits").getPtr(); + float alpha = 1.0f; + float beta = 0.0f; cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_32F, CUDA_R_32F); cublas_wrapper_->Gemm(CUBLAS_OP_N, CUBLAS_OP_N, @@ -361,29 +367,17 @@ void LLaMA::forward(std::unordered_map* output_ten vocab_size_, normed_decoder_output_buf_, hidden_units_, // n - output_logits, + logits_buf_, vocab_size_); sync_check_cuda_error(); cublas_wrapper_->setFP16GemmConfig(); - // cublas_wrapper_->Gemm(CUBLAS_OP_N, - // CUBLAS_OP_N, - // vocab_size_, - // num_tokens, - // hidden_units_, - // llama_weights->post_decoder_embedding.kernel, - // vocab_size_, - // normed_decoder_output_buf_, - // hidden_units_, // n - // logits_buf_, - // vocab_size_); - // sync_check_cuda_error(); - // - // if (std::is_same::value) { - // float* output_logits = output_tensors->at("output_logits").getPtr(); - // invokeCudaCast(output_logits, logits_buf_, num_tokens * vocab_size_, stream_); - // sync_check_cuda_error(); - // } + invokeLLaMALogSoftmax(log_probs, logits_buf_, num_tokens, vocab_size_, stream_); + sync_check_cuda_error(); + + invokeLLaMAGatherTokens( + out_log_probs, log_probs, input_ids, input_lengths, cu_seqlens_, batch_size, vocab_size_, stream_); + sync_check_cuda_error(); } } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 35ff68fd6..5f582510d 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ 
b/src/fastertransformer/models/llama/LLaMA.h @@ -71,9 +71,10 @@ class LLaMA: public BaseLayer { T* key_cache_ = nullptr; T* value_cache_ = nullptr; - T* decoder_output_buf_ = nullptr; - T* normed_decoder_output_buf_ = nullptr; - T* logits_buf_ = nullptr; + T* decoder_output_buf_ = nullptr; + T* normed_decoder_output_buf_ = nullptr; + float* logits_buf_ = nullptr; + float* log_likelihood_buf_ = nullptr; T* context_decoder_input_buf_ = nullptr; T* context_decoder_output_buf_ = nullptr; diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index 1e260eec6..3849e7c27 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -73,13 +73,15 @@ LLaMA::~LLaMA() delete ftllama; } -th::Tensor LLaMA::forward(th::Tensor& output_logits, - th::Tensor& input_ids, - th::Tensor& input_lengths, - th::Tensor& context_lengths, - const int64_t num_tokens, - const int64_t seq_len, - const int64_t attn_len) +std::vector LLaMA::forward(th::Tensor& hidden_vector, + th::Tensor& log_probs, + th::Tensor& out_log_probs, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& context_lengths, + const int64_t num_tokens, + const int64_t seq_len, + const int64_t attn_len) { CHECK_TH_CUDA(input_ids); CHECK_CONTIGUOUS(input_ids); @@ -89,8 +91,9 @@ th::Tensor LLaMA::forward(th::Tensor& output_logits, TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); const int batch_size = input_lengths.size(0); - ftllama->forward(output_logits, input_ids, input_lengths, context_lengths, num_tokens, seq_len, attn_len); - return output_logits; + ftllama->forward( + hidden_vector, log_probs, out_log_probs, input_ids, input_lengths, context_lengths, num_tokens, seq_len, attn_len); + return std::vector{hidden_vector, log_probs, out_log_probs}; } } // namespace torch_ext diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 83c580ce8..a853f0818 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -29,7 +29,9 @@ using std::vector; class IFLLaMA { public: virtual ~IFLLaMA() {} - virtual void forward(th::Tensor& output_logits, + virtual void forward(th::Tensor& hidden_vector, + th::Tensor& log_probs, + th::Tensor& out_log_probs, th::Tensor& input_ids, th::Tensor& input_lengths, th::Tensor& context_lengths, @@ -170,7 +172,9 @@ class FTLLaMA: public IFLLaMA { delete cublas_wrapper_mutex_; } - virtual void forward(th::Tensor& output_logits, + virtual void forward(th::Tensor& hidden_vector, + th::Tensor& log_probs, + th::Tensor& out_log_probs, th::Tensor& input_ids, th::Tensor& input_lengths, th::Tensor& context_lengths, @@ -196,11 +200,19 @@ class FTLLaMA: public IFLLaMA { {"attn_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &attn_len}}}; std::unordered_map output_tensors = std::unordered_map{ - {"output_logits", + {"hidden_vector", + ft::Tensor{ft::MEMORY_GPU, + (std::is_same::value) ? 
ft::TYPE_FP16 : ft::TYPE_FP32, + std::vector{(size_t)num_tokens, num_heads_ * size_per_head_}, + get_ptr(hidden_vector)}}, + {"log_probs", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, - std::vector{batch_size, (size_t)seq_len, vocab_size_}, - get_ptr(output_logits)}}}; + std::vector{(size_t)num_tokens, vocab_size_}, + get_ptr(log_probs)}}, + {"out_log_probs", + ft::Tensor{ + ft::MEMORY_GPU, ft::TYPE_FP32, std::vector{batch_size}, get_ptr(out_log_probs)}}}; try { ft::check_cuda_error(cudaEventSynchronize(event_)); @@ -266,13 +278,15 @@ class LLaMA: public th::jit::CustomClassHolder { ~LLaMA(); - th::Tensor forward(th::Tensor& output_logits, - th::Tensor& input_ids, - th::Tensor& input_lengths, - th::Tensor& context_lengths, - const int64_t num_tokens, - const int64_t seq_len, - const int64_t attn_len); + std::vector forward(th::Tensor& hidden_vector, + th::Tensor& log_probs, + th::Tensor& out_log_probs, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& context_lengths, + const int64_t num_tokens, + const int64_t seq_len, + const int64_t attn_len); private: const at::ScalarType st_; From 6ee6105f1ae3b0f8a63b5fee4bc88edfb281c714 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 19:40:03 +0000 Subject: [PATCH 44/55] 08:42 --- .../kernels/llama_kernels.cu | 24 ++++--- src/fastertransformer/models/llama/LLaMA.cc | 65 +++++++++++-------- src/fastertransformer/models/llama/LLaMA.h | 2 +- src/fastertransformer/th_op/llama/LLaMA.cc | 10 +-- src/fastertransformer/th_op/llama/LLaMA.h | 23 ++++--- 5 files changed, 73 insertions(+), 51 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index 4110819a8..0184bc0d6 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -93,14 +93,22 @@ __global__ void LLaMA_gather_tokens_kernel(float* out, // t = choice_cum_seq_lens_list[c][b] + i // choice_log_probs[b, c] = choice_log_probs[b, c] + log_likelihoods[t, choice_tokens_list[c][t+1]] - for (int b = 0; b < batch_size; ++b) { - float val = 0.f; - for (int i = 0; i < input_lengths[b] - 1; ++i) { - int t = cu_seqlens[b] + i; - val += probs[t * vocab_size + input_ids[t + 1]]; - } - out[b] = val; + // probs: [T, V] + // input_ids: [T] + int batch_idx = blockIdx.x; + + if (batch_idx >= batch_size) + return; + + float val = 0.f; + for (int i = threadIdx.x; i < input_lengths[batch_idx] - 1; i += blockDim.x) { + int t = cu_seqlens[batch_idx] + i; + val += probs[t * vocab_size + input_ids[t + 1]]; } + float sum = blockReduceSum(val); + + if (threadIdx.x == 0) + out[batch_idx] = sum; } void invokeLLaMAGatherTokens(float* out, @@ -112,7 +120,7 @@ void invokeLLaMAGatherTokens(float* out, const int vocab_size, cudaStream_t stream) { - LLaMA_gather_tokens_kernel<<<1, 1, 0, stream>>>( + LLaMA_gather_tokens_kernel<<>>( out, probs, input_ids, input_lengths, cu_seqlens, batch_size, vocab_size); } diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 854df9de6..6ab73fb66 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -226,11 +226,12 @@ void LLaMA::forward(std::unordered_map* output_ten // num_tokens [1] int on cpu // seq_len [1] int on cpu // attn_len [1] int on cpu + // is_context [1] int on cpu // output_tensors: // hidden_vector [num_tokens, hidden_size] // log_probs [num_tokens, vocab_size] - // out_log_probs [batch_size] + // cum_probs [batch_size] 
FT_CHECK_WITH_INFO(input_tensors->size() == 6, "input_tensors->size() == 6"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); @@ -245,9 +246,10 @@ void LLaMA::forward(std::unordered_map* output_ten const int num_tokens = input_tensors->at("num_tokens").getVal(); const int seq_len = input_tensors->at("seq_len").getVal(); const int attn_len = input_tensors->at("attn_len").getVal(); + const int is_context = input_tensors->at("is_context").getVal(); T* hidden_vector = output_tensors->at("hidden_vector").getPtr(); float* log_probs = output_tensors->at("log_probs").getPtr(); - float* out_log_probs = output_tensors->at("out_log_probs").getPtr(); + float* cum_probs = output_tensors->at("cum_probs").getPtr(); allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); @@ -276,11 +278,12 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); } else { - ftNcclGroupStart(); - ftNcclRecv( - context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - ftNcclGroupEnd(); - sync_check_cuda_error(); + // ftNcclGroupStart(); + // ftNcclRecv( + // context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, + // stream_); + // ftNcclGroupEnd(); + // sync_check_cuda_error(); } std::unordered_map decoder_input_tensors{ @@ -288,7 +291,7 @@ void LLaMA::forward(std::unordered_map* output_ten Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, - (pipeline_para_.rank_ == 0) ? context_decoder_input_buf_ : hidden_vector}}, + pipeline_para_.rank_ == 0 ? context_decoder_input_buf_ : hidden_vector}}, {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, @@ -305,7 +308,12 @@ void LLaMA::forward(std::unordered_map* output_ten } std::unordered_map decoder_output_tensors{ - {"decoder_output", Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, hidden_vector}}, + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {(size_t)num_tokens, hidden_units_}, + (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) ? 
context_decoder_output_buf_ : + hidden_vector}}, {"key_cache", Tensor{MEMORY_GPU, data_type, @@ -328,26 +336,27 @@ void LLaMA::forward(std::unordered_map* output_ten // } if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { - buf_no_ = (buf_no_ + 1) % num_buffers_; - check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); - invokeLLaMACopyKernel( - context_decoder_output_buf_clone_[buf_no_], hidden_vector, num_tokens * hidden_units_, stream_); - sync_check_cuda_error(); - check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); - check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); - ftNcclGroupStart(); - ftNcclSend(context_decoder_output_buf_clone_[buf_no_], - num_tokens * hidden_units_, - pipeline_para_.rank_ + 1, - pipeline_para_, - comm_stream_); - ftNcclGroupEnd(); - check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); - sync_check_cuda_error(); + // buf_no_ = (buf_no_ + 1) % num_buffers_; + // check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); + // invokeLLaMACopyKernel( + // context_decoder_output_buf_clone_[buf_no_], context_decoder_output_buf_, num_tokens * + // hidden_units_, stream_); + // sync_check_cuda_error(); + // check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); + // check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); + // ftNcclGroupStart(); + // ftNcclSend(context_decoder_output_buf_clone_[buf_no_], + // num_tokens * hidden_units_, + // pipeline_para_.rank_ + 1, + // pipeline_para_, + // comm_stream_); + // ftNcclGroupEnd(); + // check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); + // sync_check_cuda_error(); } - else { + else if (!is_context){ invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, - hidden_vector, + context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, layernorm_eps_, num_tokens, @@ -376,7 +385,7 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); invokeLLaMAGatherTokens( - out_log_probs, log_probs, input_ids, input_lengths, cu_seqlens_, batch_size, vocab_size_, stream_); + cum_probs, log_probs, input_ids, input_lengths, cu_seqlens_, batch_size, vocab_size_, stream_); sync_check_cuda_error(); } } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 5f582510d..f4143ee39 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -38,7 +38,7 @@ class LLaMA: public BaseLayer { size_t random_seed_; size_t max_seq_len_; - static constexpr int num_buffers_ = 5; + static constexpr int num_buffers_ = 10; int buf_no_ = 0; cudaStream_t comm_stream_; cudaEvent_t kern_event_[num_buffers_]; diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index 3849e7c27..b098f28f7 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -75,13 +75,15 @@ LLaMA::~LLaMA() std::vector LLaMA::forward(th::Tensor& hidden_vector, th::Tensor& log_probs, - th::Tensor& out_log_probs, + th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, th::Tensor& context_lengths, const int64_t num_tokens, const int64_t seq_len, - const int64_t attn_len) + const int64_t attn_len, + const int64_t is_context + ) { CHECK_TH_CUDA(input_ids); CHECK_CONTIGUOUS(input_ids); @@ -92,8 +94,8 @@ std::vector LLaMA::forward(th::Tensor& hidden_vector, const int batch_size = input_lengths.size(0); ftllama->forward( - 
hidden_vector, log_probs, out_log_probs, input_ids, input_lengths, context_lengths, num_tokens, seq_len, attn_len); - return std::vector{hidden_vector, log_probs, out_log_probs}; + hidden_vector, log_probs, cum_probs, input_ids, input_lengths, context_lengths, num_tokens, seq_len, attn_len, is_context); + return std::vector{hidden_vector, log_probs, cum_probs}; } } // namespace torch_ext diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index a853f0818..ff2caa238 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -31,13 +31,14 @@ class IFLLaMA { virtual ~IFLLaMA() {} virtual void forward(th::Tensor& hidden_vector, th::Tensor& log_probs, - th::Tensor& out_log_probs, + th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, th::Tensor& context_lengths, const int num_tokens, const int seq_len, - const int attn_len) = 0; + const int attn_len, + const int is_context) = 0; }; template @@ -174,13 +175,14 @@ class FTLLaMA: public IFLLaMA { virtual void forward(th::Tensor& hidden_vector, th::Tensor& log_probs, - th::Tensor& out_log_probs, + th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, th::Tensor& context_lengths, const int num_tokens, const int seq_len, - const int attn_len) override + const int attn_len, + const int is_context) override { const size_t batch_size = (size_t)input_lengths.size(0); @@ -197,7 +199,8 @@ class FTLLaMA: public IFLLaMA { ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(context_lengths)}}, {"num_tokens", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &num_tokens}}, {"seq_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &seq_len}}, - {"attn_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &attn_len}}}; + {"attn_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &attn_len}}, + {"is_context", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &is_context}}}; std::unordered_map output_tensors = std::unordered_map{ {"hidden_vector", @@ -210,9 +213,8 @@ class FTLLaMA: public IFLLaMA { ft::TYPE_FP32, std::vector{(size_t)num_tokens, vocab_size_}, get_ptr(log_probs)}}, - {"out_log_probs", - ft::Tensor{ - ft::MEMORY_GPU, ft::TYPE_FP32, std::vector{batch_size}, get_ptr(out_log_probs)}}}; + {"cum_probs", + ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, std::vector{batch_size}, get_ptr(cum_probs)}}}; try { ft::check_cuda_error(cudaEventSynchronize(event_)); @@ -280,13 +282,14 @@ class LLaMA: public th::jit::CustomClassHolder { std::vector forward(th::Tensor& hidden_vector, th::Tensor& log_probs, - th::Tensor& out_log_probs, + th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, th::Tensor& context_lengths, const int64_t num_tokens, const int64_t seq_len, - const int64_t attn_len); + const int64_t attn_len, + const int64_t is_context); private: const at::ScalarType st_; From 1955508c793280e482b41fe03ea708ef4c3a41c6 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 19:47:23 +0000 Subject: [PATCH 45/55] # input check bug fix --- src/fastertransformer/models/llama/LLaMA.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 6ab73fb66..14e7971f9 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -233,7 +233,7 @@ void LLaMA::forward(std::unordered_map* output_ten // 
log_probs [num_tokens, vocab_size] // cum_probs [batch_size] - FT_CHECK_WITH_INFO(input_tensors->size() == 6, "input_tensors->size() == 6"); + FT_CHECK_WITH_INFO(input_tensors->size() == 7, "input_tensors->size() == 6"); FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); From 57dded422703f7bcf789dd88ff93ebe3be74b74a Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 20:45:02 +0000 Subject: [PATCH 46/55] code rf --- .../attention_layers/LLaMAContextAttentionLayer.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 0c2307fc8..c8777c4cc 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -85,11 +85,12 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten 3 * hidden_units_ /* n */); sync_check_cuda_error(); - if (padding_offset != nullptr) { - // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync(q_buf_2_, 0, batch_size * max_seq_len * 3 * hidden_units_ * sizeof(T), stream_); - sync_check_cuda_error(); - } +// if (padding_offset != nullptr) { +// // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous +// cudaMemsetAsync(q_buf_2_, 0, batch_size * max_seq_len * 3 * hidden_units_ * sizeof(T), stream_); +// sync_check_cuda_error(); +// } + invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, v_buf_2_, From d3a83aac7fabf702307faefc038b191ad76412de Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 21:30:14 +0000 Subject: [PATCH 47/55] add macro --- src/fastertransformer/models/llama/LLaMA.cc | 86 ++++++++++++--------- src/fastertransformer/models/llama/LLaMA.h | 8 +- 2 files changed, 55 insertions(+), 39 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 14e7971f9..8ab1b1a6e 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -29,11 +29,14 @@ namespace fastertransformer { template void LLaMA::initialize() { +#ifdef USE_NCCL check_cuda_error(cudaStreamCreateWithFlags(&comm_stream_, cudaStreamNonBlocking)); for (int i = 0; i < num_buffers_; ++i) { check_cuda_error(cudaEventCreate(&kern_event_[i])); check_cuda_error(cudaEventCreate(&comm_event_[i])); } +#endif + llama_context_decoder_ = new LLaMAContextDecoder(head_num_, size_per_head_, inter_size_, @@ -76,10 +79,12 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ context_decoder_output_buf_ = (T*)(allocator_->reMalloc( context_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); +#ifdef USE_NCCL for (int i = 0; i < num_buffers_; ++i) { context_decoder_output_buf_clone_[i] = (T*)(allocator_->reMalloc( context_decoder_output_buf_clone_[i], sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); } +#endif normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -101,9 +106,11 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&key_cache_)); allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); +#ifdef USE_NCCL for (int i = 0; i < num_buffers_; ++i) { 
allocator_->free((void**)(&context_decoder_output_buf_clone_[i])); } +#endif allocator_->free((void**)(&logits_buf_)); is_allocate_buffer_ = false; } @@ -195,11 +202,13 @@ LLaMA::LLaMA(LLaMA const& llama): template LLaMA::~LLaMA() { +#ifdef USE_NCCL check_cuda_error(cudaStreamDestroy(comm_stream_)); for (int i = 0; i < num_buffers_; ++i) { check_cuda_error(cudaEventDestroy(kern_event_[i])); check_cuda_error(cudaEventDestroy(comm_event_[i])); } +#endif delete llama_context_decoder_; freeBuffer(); @@ -258,10 +267,6 @@ void LLaMA::forward(std::unordered_map* output_ten invokeLLaMAGetPaddingOffsetAndCuSeqLens( padding_offset_, cu_seqlens_, input_lengths, batch_size, seq_len, stream_); sync_check_cuda_error(); - - // invokeRemovePadding(decoder_layer_output_, decoder_input, padding_offset_, h_token_num, hidden_units_, - // stream_); - // sync_check_cuda_error(); } invokeLLaMABuildDecoderAttentionMask( @@ -278,12 +283,13 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); } else { - // ftNcclGroupStart(); - // ftNcclRecv( - // context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, - // stream_); - // ftNcclGroupEnd(); - // sync_check_cuda_error(); +#ifdef USE_NCCL + ftNcclGroupStart(); + ftNcclRecv( + context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_); + ftNcclGroupEnd(); + sync_check_cuda_error(); +#endif } std::unordered_map decoder_input_tensors{ @@ -291,7 +297,12 @@ void LLaMA::forward(std::unordered_map* output_ten Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, - pipeline_para_.rank_ == 0 ? context_decoder_input_buf_ : hidden_vector}}, +#ifdef USE_NCCL + context_decoder_input_buf_ +#else + pipeline_para_.rank_ == 0 ? context_decoder_input_buf_ : hidden_vector +#endif + }}, {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, @@ -312,8 +323,12 @@ void LLaMA::forward(std::unordered_map* output_ten Tensor{MEMORY_GPU, data_type, {(size_t)num_tokens, hidden_units_}, - (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) ? context_decoder_output_buf_ : - hidden_vector}}, +#ifdef USE_NCCL + context_decoder_output_buf_ +#else + (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) ? 
context_decoder_output_buf_ : hidden_vector +#endif + }}, {"key_cache", Tensor{MEMORY_GPU, data_type, @@ -329,32 +344,29 @@ void LLaMA::forward(std::unordered_map* output_ten &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - // if (is_unpadded_mha) { - // invokeRebuildPadding( - // decoder_output, decoder_layer_output_, padding_offset_, h_token_num, hidden_units_, stream_); - // sync_check_cuda_error(); - // } - if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { - // buf_no_ = (buf_no_ + 1) % num_buffers_; - // check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); - // invokeLLaMACopyKernel( - // context_decoder_output_buf_clone_[buf_no_], context_decoder_output_buf_, num_tokens * - // hidden_units_, stream_); - // sync_check_cuda_error(); - // check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); - // check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); - // ftNcclGroupStart(); - // ftNcclSend(context_decoder_output_buf_clone_[buf_no_], - // num_tokens * hidden_units_, - // pipeline_para_.rank_ + 1, - // pipeline_para_, - // comm_stream_); - // ftNcclGroupEnd(); - // check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); - // sync_check_cuda_error(); +#ifdef USE_NCCL + buf_no_ = (buf_no_ + 1) % num_buffers_; + check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); + invokeLLaMACopyKernel(context_decoder_output_buf_clone_[buf_no_], + context_decoder_output_buf_, + num_tokens * hidden_units_, + stream_); + sync_check_cuda_error(); + check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); + check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); + ftNcclGroupStart(); + ftNcclSend(context_decoder_output_buf_clone_[buf_no_], + num_tokens * hidden_units_, + pipeline_para_.rank_ + 1, + pipeline_para_, + comm_stream_); + ftNcclGroupEnd(); + check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); + sync_check_cuda_error(); +#endif } - else if (!is_context){ + else if (!is_context) { invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index f4143ee39..c5cb1c233 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -23,6 +23,8 @@ #include "src/fastertransformer/models/llama/LLaMAWeight.h" #include "src/fastertransformer/utils/custom_ar_comm.h" +#define USE_NCCL + namespace fastertransformer { template @@ -38,11 +40,14 @@ class LLaMA: public BaseLayer { size_t random_seed_; size_t max_seq_len_; - static constexpr int num_buffers_ = 10; +#ifdef USE_NCCL + static constexpr int num_buffers_ = 5; int buf_no_ = 0; cudaStream_t comm_stream_; cudaEvent_t kern_event_[num_buffers_]; cudaEvent_t comm_event_[num_buffers_]; + T* context_decoder_output_buf_clone_[num_buffers_] = {nullptr}; +#endif static constexpr float layernorm_eps_ = 1e-6f; @@ -78,7 +83,6 @@ class LLaMA: public BaseLayer { T* context_decoder_input_buf_ = nullptr; T* context_decoder_output_buf_ = nullptr; - T* context_decoder_output_buf_clone_[num_buffers_] = {nullptr}; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); From 1365462163606f06ac007a4e5f988c0b147d66b2 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 22:22:52 +0000 Subject: [PATCH 48/55] 07_03 
--- src/fastertransformer/models/llama/LLaMA.cc | 39 +++++++++++---------- src/fastertransformer/models/llama/LLaMA.h | 2 +- src/fastertransformer/th_op/llama/LLaMA.h | 3 +- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 8ab1b1a6e..b55e04149 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -284,10 +284,8 @@ void LLaMA::forward(std::unordered_map* output_ten } else { #ifdef USE_NCCL - ftNcclGroupStart(); ftNcclRecv( context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - ftNcclGroupEnd(); sync_check_cuda_error(); #endif } @@ -346,24 +344,29 @@ void LLaMA::forward(std::unordered_map* output_ten if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { #ifdef USE_NCCL - buf_no_ = (buf_no_ + 1) % num_buffers_; - check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); - invokeLLaMACopyKernel(context_decoder_output_buf_clone_[buf_no_], - context_decoder_output_buf_, - num_tokens * hidden_units_, - stream_); - sync_check_cuda_error(); - check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); - check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); + // buf_no_ = (buf_no_ + 1) % num_buffers_; + // check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); + // invokeLLaMACopyKernel(context_decoder_output_buf_clone_[buf_no_], + // context_decoder_output_buf_, + // num_tokens * hidden_units_, + // stream_); + // sync_check_cuda_error(); + // check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); + // check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); + // ftNcclGroupStart(); + // ftNcclSend(context_decoder_output_buf_clone_[buf_no_], + // num_tokens * hidden_units_, + // pipeline_para_.rank_ + 1, + // pipeline_para_, + // comm_stream_); + // ftNcclGroupEnd(); + // check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); + // sync_check_cuda_error(); + ftNcclGroupStart(); - ftNcclSend(context_decoder_output_buf_clone_[buf_no_], - num_tokens * hidden_units_, - pipeline_para_.rank_ + 1, - pipeline_para_, - comm_stream_); + ftNcclSend( + context_decoder_output_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, stream_); ftNcclGroupEnd(); - check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); - sync_check_cuda_error(); #endif } else if (!is_context) { diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index c5cb1c233..7bb73c524 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -23,7 +23,7 @@ #include "src/fastertransformer/models/llama/LLaMAWeight.h" #include "src/fastertransformer/utils/custom_ar_comm.h" -#define USE_NCCL +//#define USE_NCCL namespace fastertransformer { diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index ff2caa238..3677515fb 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -113,7 +113,8 @@ class FTLLaMA: public IFLLaMA { llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * num_layers_ + 3]); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); - ft::check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + //ft::check_cuda_error(cudaStreamCreateWithFlags(&stream_, 
cudaStreamNonBlocking)); + ft::check_cuda_error(cudaStreamCreate(&stream_)); ft::check_cuda_error(cudaEventCreate(&event_)); cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); From 305540f6510099dd544b295200ca664df9c5c051 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sat, 30 Sep 2023 22:54:55 +0000 Subject: [PATCH 49/55] add multiple devent --- src/fastertransformer/th_op/llama/LLaMA.h | 24 +++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 3677515fb..f5ffee6f7 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -113,9 +113,12 @@ class FTLLaMA: public IFLLaMA { llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * num_layers_ + 3]); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); - //ft::check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + // ft::check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); ft::check_cuda_error(cudaStreamCreate(&stream_)); - ft::check_cuda_error(cudaEventCreate(&event_)); + + for (int i = 0; i < num_events_; ++i) { + ft::check_cuda_error(cudaEventCreate(&event_[i])); + } cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); cublasSetStream(cublasHandle, stream_); @@ -160,7 +163,9 @@ class FTLLaMA: public IFLLaMA { ~FTLLaMA() override { - ft::check_cuda_error(cudaEventDestroy(event_)); + for (int i = 0; i < num_events_; ++i) { + ft::check_cuda_error(cudaEventDestroy(event_[i])); + } ft::check_cuda_error(cudaStreamDestroy(stream_)); delete llama_; @@ -218,12 +223,13 @@ class FTLLaMA: public IFLLaMA { ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, std::vector{batch_size}, get_ptr(cum_probs)}}}; try { - ft::check_cuda_error(cudaEventSynchronize(event_)); + ft::check_cuda_error(cudaEventSynchronize(event_[ev_no_])); llama_->forward(&output_tensors, &input_tensors, &llama_weights_); - ft::check_cuda_error(cudaEventRecord(event_, stream_)); + ft::check_cuda_error(cudaEventRecord(event_[ev_no_], stream_)); auto stream = at::cuda::getCurrentCUDAStream().stream(); - ft::check_cuda_error(cudaStreamWaitEvent(stream, event_)); + ft::check_cuda_error(cudaStreamWaitEvent(stream, event_[ev_no_])); + ev_no_ = (ev_no_ + 1) % num_events_; } catch (std::runtime_error& error) { std::cout << error.what(); @@ -247,8 +253,10 @@ class FTLLaMA: public IFLLaMA { int64_t tensor_para_size_; int64_t pipeline_para_size_; - cudaStream_t stream_; - cudaEvent_t event_; + static constexpr int num_events_ = 5; + int ev_no_ = 0; + cudaEvent_t event_[num_events_]; + cudaStream_t stream_; std::vector weights_; cublasLtHandle_t cublasltHandle_; From 294a6fc91089289d8aca50d0ed1c76a3df673ee7 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 1 Oct 2023 01:47:34 +0000 Subject: [PATCH 50/55] ft_llama-06_48 --- .../kernels/llama_kernels.cu | 114 ++++++++++++------ src/fastertransformer/kernels/llama_kernels.h | 17 ++- src/fastertransformer/models/llama/LLaMA.cc | 89 ++++++++++---- src/fastertransformer/models/llama/LLaMA.h | 7 +- .../models/llama/LLaMAContextDecoder.cc | 48 +++----- src/fastertransformer/th_op/llama/LLaMA.cc | 18 ++- src/fastertransformer/th_op/llama/LLaMA.h | 28 +++-- 7 files changed, 214 insertions(+), 107 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index 0184bc0d6..d007350fe 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu 
+++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -11,6 +11,70 @@ using namespace std; namespace fastertransformer { +template +__global__ void LLaMA_get_last_tokens(T* out, T* in, const int* cu_seqlens, int batch_size, int hidden_size) +{ + // in [num_tokens, hidden_size] + // out [batch_size, hidden_size] + int batch_idx = blockIdx.x; + + if (batch_idx >= batch_size) + return; + + int pos = cu_seqlens[batch_idx + 1] - 1; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + out[batch_idx * hidden_size + idx] = in[pos * hidden_size + idx]; + } +} + +template +void invokeLLaMAGetLastTokens( + T* out, T* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream) +{ + dim3 grid(batch_size); + dim3 block(256); + LLaMA_get_last_tokens<<>>(out, in, cu_seqlens, batch_size, hidden_size); +} + +template void invokeLLaMAGetLastTokens( + float* out, float* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); +template void invokeLLaMAGetLastTokens( + half* out, half* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); + +__global__ void LLaMA_extract_targets( + float* out, float* in, const int* target_ids, const int* cu_seqlens, int beam_width, int batch_size, int vocab_size, int num_tokens) +{ + // in [batch_size, vocab_size] + // target_ids [ beam_width, num_tokens ] + // out [beam_width, batch_size] + int batch_idx = blockIdx.x * blockDim.x + threadIdx.x; + int beam_idx = blockIdx.y * blockDim.y + threadIdx.y; + + if (batch_idx >= batch_size || beam_idx >= beam_width) + return; + + int pos = cu_seqlens[batch_idx + 1] - 1; + int target_idx = target_ids[beam_idx * num_tokens + pos]; + out[beam_idx * batch_size + batch_idx] = in[batch_idx * vocab_size + target_idx]; +} + +void invokeLLaMAExtractTargets(float* out, + float* in, + const int* target_ids, + const int* cu_seqlens, + int beam_width, + int batch_size, + int vocab_size, + int num_tokens, + cudaStream_t stream) +{ + dim3 block(32, 4); + dim3 grid((batch_size + block.x - 1) / block.x, (beam_width + block.y - 1) / block.y); + LLaMA_extract_targets<<>>( + out, in, target_ids, cu_seqlens, beam_width, batch_size, vocab_size, num_tokens); +} + __global__ void LLaMA_log_softmax(float* out, const float* logits, const int num_tokens, const int vocab_size) { // logits [T, V] @@ -61,49 +125,26 @@ void invokeLLaMALogSoftmax( __global__ void LLaMA_gather_tokens_kernel(float* out, const float* probs, - const int* input_ids, const int* input_lengths, + const int* target_ids, const int* cu_seqlens, const int batch_size, - const int vocab_size) + const int vocab_size, + const int num_tokens) { - /* - // probs: [T, V] - // input_ids: [T] - int batch_idx = blockIdx.x; - - if (batch_idx >= batch_size) - return; - - float val = 0.f; - // for (int i = cu_seqlens[batch_idx] + threadIdx.x; i < cu_seqlens[batch_idx + 1] - 1; i += blockDim.x) { - // int input_idx = input_ids[i + 1]; - // val += probs[i * vocab_size + input_idx]; - // } - for (int t = cu_seqlens[batch_idx]; t < cu_seqlens[batch_idx + 1] - 1; ++t) { - val += probs[t * vocab_size + input_ids[t + 1]]; - } - //float sum = blockReduceSum(val); - - if (threadIdx.x == 0) - out[batch_idx] = val; - */ - // for b in range(bsz): - // for i in range(choice_seq_lens_list[c][b]-1): - // t = choice_cum_seq_lens_list[c][b] + i - // choice_log_probs[b, c] = choice_log_probs[b, c] + log_likelihoods[t, choice_tokens_list[c][t+1]] - // probs: [T, V] - // input_ids: [T] + // target_ids: [T] + // out: [batch_size] int 
batch_idx = blockIdx.x; if (batch_idx >= batch_size) return; float val = 0.f; - for (int i = threadIdx.x; i < input_lengths[batch_idx] - 1; i += blockDim.x) { - int t = cu_seqlens[batch_idx] + i; - val += probs[t * vocab_size + input_ids[t + 1]]; + for (int i = threadIdx.x; i < input_lengths[batch_idx]; i += blockDim.x) { + int pos = cu_seqlens[batch_idx] + i; + int target_pos = target_ids[pos]; + val += (target_pos > 0) ? probs[pos * vocab_size + target_pos] : 0.f; } float sum = blockReduceSum(val); @@ -113,15 +154,18 @@ __global__ void LLaMA_gather_tokens_kernel(float* out, void invokeLLaMAGatherTokens(float* out, const float* probs, - const int* input_ids, const int* input_lengths, + const int* target_ids, const int* cu_seqlens, const int batch_size, const int vocab_size, + const int num_tokens, cudaStream_t stream) { - LLaMA_gather_tokens_kernel<<>>( - out, probs, input_ids, input_lengths, cu_seqlens, batch_size, vocab_size); + dim3 grid(batch_size); + dim3 block(256); + LLaMA_gather_tokens_kernel<<>>( + out, probs, input_lengths, target_ids, cu_seqlens, batch_size, vocab_size, num_tokens); } template diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h index 66488462d..754ed6bba 100644 --- a/src/fastertransformer/kernels/llama_kernels.h +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -32,13 +32,28 @@ void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream) void invokeLLaMAGatherTokens(float* out, const float* probs, - const int* input_ids, const int* input_lengths, + const int* target_ids, const int* cu_seqlens, const int batch_size, const int vocab_size, + const int num_tokens, cudaStream_t stream); void invokeLLaMALogSoftmax( float* out, const float* logits, const int num_tokens, const int vocab_size, cudaStream_t stream); + +template +void invokeLLaMAGetLastTokens( + T* out, T* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); + +void invokeLLaMAExtractTargets(float* out, + float* in, + const int* target_ids, + const int* cu_seqlens, + int beam_width, + int batch_size, + int vocab_size, + int num_tokens, + cudaStream_t stream); } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index b55e04149..f0f6d4697 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -86,6 +86,8 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ } #endif + context_output_buf_ = + (T*)(allocator_->reMalloc(context_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); normed_decoder_output_buf_ = (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); logits_buf_ = @@ -106,6 +108,7 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&key_cache_)); allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); + allocator_->free((void**)(&context_output_buf_)); #ifdef USE_NCCL for (int i = 0; i < num_buffers_; ++i) { allocator_->free((void**)(&context_decoder_output_buf_clone_[i])); @@ -231,8 +234,8 @@ void LLaMA::forward(std::unordered_map* output_ten // input_tensors: // input_ids [num_tokens] // input_lengths [batch_size] + // target_ids [beam_width, num_tokens] // context_lengths [batch_size] - // num_tokens [1] int on cpu // seq_len [1] int on cpu // attn_len [1] int on cpu // is_context [1] int on cpu @@ 
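The rewritten LLaMA_gather_tokens_kernel accumulates, per sequence, the log-probability assigned to the supplied target token at every position, skipping positions whose target id is not positive, and then reduces the per-thread partials across the block. A minimal sketch of the same reduction that replaces FT's blockReduceSum helper with a plain shared-memory tree (it assumes a fixed, power-of-two block size of 256, matching the launcher above):

```cuda
#include <cuda_runtime.h>

// Sum the log-probabilities of the target token over one sequence per block.
//   log_probs : [num_tokens, vocab_size]
//   target_ids: [num_tokens]
//   out       : [batch_size]
// Launch with exactly 256 threads per block, one block per sequence.
__global__ void sum_target_logprobs(float* out, const float* log_probs,
                                    const int* target_ids, const int* cu_seqlens,
                                    const int* input_lengths, int vocab_size)
{
    __shared__ float partial[256];
    const int b = blockIdx.x;

    float val = 0.f;
    for (int i = threadIdx.x; i < input_lengths[b]; i += blockDim.x) {
        const int pos = cu_seqlens[b] + i;
        const int tgt = target_ids[pos];
        val += (tgt > 0) ? log_probs[pos * vocab_size + tgt] : 0.f;  // tgt <= 0 means "don't score"
    }

    // Plain shared-memory tree reduction (the patch uses FT's blockReduceSum).
    partial[threadIdx.x] = val;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride)
            partial[threadIdx.x] += partial[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        out[b] = partial[0];
}
```

Launched as sum_target_logprobs<<<batch_size, 256, 0, stream>>>(...), it produces one accumulated log-probability per sequence, which is what cum_probs stores in the non-context branch.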
-240,25 +243,30 @@ void LLaMA::forward(std::unordered_map* output_ten // output_tensors: // hidden_vector [num_tokens, hidden_size] // log_probs [num_tokens, vocab_size] - // cum_probs [batch_size] + // cum_probs [beam_width, batch_size] - FT_CHECK_WITH_INFO(input_tensors->size() == 7, "input_tensors->size() == 6"); - FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); + FT_CHECK_WITH_INFO(input_tensors->size() == 7, "input_tensors->size() == 7"); + FT_CHECK(input_tensors->at("input_ids").shape.size() == 1); FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); + FT_CHECK(input_tensors->at("target_ids").shape.size() == 2); + FT_CHECK(input_tensors->at("context_lengths").shape.size() == 1); const DataType data_type = getTensorType(); const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); const size_t batch_size = input_tensors->at("input_lengths").shape[0]; - const int* input_ids = input_tensors->at("input_ids").getPtr(); - const int* context_lengths = input_tensors->at("context_lengths").getPtr(); - const int* input_lengths = input_tensors->at("input_lengths").getPtr(); - const int num_tokens = input_tensors->at("num_tokens").getVal(); - const int seq_len = input_tensors->at("seq_len").getVal(); - const int attn_len = input_tensors->at("attn_len").getVal(); - const int is_context = input_tensors->at("is_context").getVal(); - T* hidden_vector = output_tensors->at("hidden_vector").getPtr(); - float* log_probs = output_tensors->at("log_probs").getPtr(); - float* cum_probs = output_tensors->at("cum_probs").getPtr(); + const size_t num_tokens = input_tensors->at("input_ids").shape[0]; + const size_t beam_width = input_tensors->at("target_ids").shape[0]; + + const int* input_ids = input_tensors->at("input_ids").getPtr(); + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* target_ids = input_tensors->at("target_ids").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int seq_len = input_tensors->at("seq_len").getVal(); + const int attn_len = input_tensors->at("attn_len").getVal(); + const int is_context = input_tensors->at("is_context").getVal(); + T* hidden_vector = output_tensors->at("hidden_vector").getPtr(); + float* log_probs = output_tensors->at("log_probs").getPtr(); + float* cum_probs = output_tensors->at("cum_probs").getPtr(); allocateBuffer(batch_size, seq_len, max_seq_len_); sync_check_cuda_error(); @@ -294,7 +302,7 @@ void LLaMA::forward(std::unordered_map* output_ten {"decoder_input", Tensor{MEMORY_GPU, data_type, - {(size_t)num_tokens, hidden_units_}, + {num_tokens, hidden_units_}, #ifdef USE_NCCL context_decoder_input_buf_ #else @@ -305,22 +313,19 @@ void LLaMA::forward(std::unordered_map* output_ten Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, context_lengths}}, - {"num_tokens", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &num_tokens}}, {"seq_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &seq_len}}, {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &attn_len}}}; if (is_unpadded_mha) { - decoder_input_tensors.insert( - {"padding_offset", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)num_tokens}, padding_offset_}}); - decoder_input_tensors.insert( - {"cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens_}}); + decoder_input_tensors.insert({"padding_offset", 
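The unpadded-MHA path above hands the decoder padding_offset and cu_seqlens alongside the packed token stream; cu_seqlens is simply the exclusive prefix sum of input_lengths with one trailing entry. A small host-side sketch of that relationship (the function name is illustrative):

```cuda
#include <vector>

// cu_seqlens[b+1] - cu_seqlens[b] == input_lengths[b]
// cu_seqlens[batch_size] == num_tokens
std::vector<int> build_cu_seqlens(const std::vector<int>& input_lengths)
{
    std::vector<int> cu_seqlens(input_lengths.size() + 1, 0);
    for (size_t b = 0; b < input_lengths.size(); ++b) {
        cu_seqlens[b + 1] = cu_seqlens[b] + input_lengths[b];
    }
    return cu_seqlens;
}
```

With this layout, cu_seqlens[b + 1] - 1 is the flat index of sequence b's last token, which is exactly what the last-token and target-extraction kernels read.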
Tensor{MEMORY_GPU, TYPE_INT32, {num_tokens}, padding_offset_}}); + decoder_input_tensors.insert({"cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size + 1}, cu_seqlens_}}); } std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, data_type, - {(size_t)num_tokens, hidden_units_}, + {num_tokens, hidden_units_}, #ifdef USE_NCCL context_decoder_output_buf_ #else @@ -369,7 +374,45 @@ void LLaMA::forward(std::unordered_map* output_ten ftNcclGroupEnd(); #endif } - else if (!is_context) { + else if (is_context) { + invokeLLaMAGetLastTokens( + context_output_buf_, context_decoder_output_buf_, cu_seqlens_, batch_size, hidden_units_, stream_); + sync_check_cuda_error(); + + invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, + context_output_buf_, + llama_weights->post_decoder_layernorm.gamma, + layernorm_eps_, + batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); + + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_32F, CUDA_R_32F); + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + vocab_size_, + batch_size, + hidden_units_, + llama_weights->post_decoder_embedding.kernel, + vocab_size_, + normed_decoder_output_buf_, + hidden_units_, // n + logits_buf_, + vocab_size_); + sync_check_cuda_error(); + cublas_wrapper_->setFP16GemmConfig(); + + invokeLLaMALogSoftmax(log_probs, logits_buf_, batch_size, vocab_size_, stream_); + sync_check_cuda_error(); + + invokeLLaMAExtractTargets( + cum_probs, log_probs, target_ids, cu_seqlens_, beam_width, batch_size, vocab_size_, num_tokens, stream_); + sync_check_cuda_error(); + } + else { invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, context_decoder_output_buf_, llama_weights->post_decoder_layernorm.gamma, @@ -400,7 +443,7 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); invokeLLaMAGatherTokens( - cum_probs, log_probs, input_ids, input_lengths, cu_seqlens_, batch_size, vocab_size_, stream_); + cum_probs, log_probs, input_lengths, target_ids, cu_seqlens_, batch_size, vocab_size_, num_tokens, stream_); sync_check_cuda_error(); } } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 7bb73c524..592861008 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -46,7 +46,7 @@ class LLaMA: public BaseLayer { cudaStream_t comm_stream_; cudaEvent_t kern_event_[num_buffers_]; cudaEvent_t comm_event_[num_buffers_]; - T* context_decoder_output_buf_clone_[num_buffers_] = {nullptr}; + T* context_decoder_output_buf_clone_[num_buffers_] = {nullptr}; #endif static constexpr float layernorm_eps_ = 1e-6f; @@ -77,12 +77,13 @@ class LLaMA: public BaseLayer { T* value_cache_ = nullptr; T* decoder_output_buf_ = nullptr; + T* context_output_buf_ = nullptr; T* normed_decoder_output_buf_ = nullptr; float* logits_buf_ = nullptr; float* log_likelihood_buf_ = nullptr; - T* context_decoder_input_buf_ = nullptr; - T* context_decoder_output_buf_ = nullptr; + T* context_decoder_input_buf_ = nullptr; + T* context_decoder_output_buf_ = nullptr; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 4a257a405..5c90f303e 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ 
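In the new is_context branch, the gathered last-token states are layer-normed and projected onto the vocabulary with one GEMM before the log-softmax. cuBLAS is column-major, so passing vocab_size as m and batch_size as n yields the row-major product logits[batch, vocab] = normed[batch, hidden_units] x W[hidden_units, vocab]. A hedged sketch of the same dimension bookkeeping with a bare cublasGemmEx call, FP16 inputs and FP32 accumulation as set up by setGemmConfig in the patch (this bypasses FT's cublasMMWrapper and is only meant to show the mapping):

```cuda
#include <cublas_v2.h>
#include <cuda_fp16.h>

// Row-major: logits[batch, vocab] = hidden[batch, hidden_units] * W[hidden_units, vocab]
// Column-major view used by cuBLAS: C(vocab x batch) = A(vocab x hidden_units) * B(hidden_units x batch)
void vocab_projection(cublasHandle_t handle, const half* W, const half* hidden,
                      float* logits, int batch, int hidden_units, int vocab)
{
    const float alpha = 1.0f, beta = 0.0f;
    cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                 vocab, batch, hidden_units,
                 &alpha,
                 W, CUDA_R_16F, vocab,              // A: vocab x hidden_units, lda = vocab
                 hidden, CUDA_R_16F, hidden_units,  // B: hidden_units x batch, ldb = hidden_units
                 &beta,
                 logits, CUDA_R_32F, vocab,         // C: vocab x batch, ldc = vocab
                 CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
}
```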
-175,9 +175,8 @@ void LLaMAContextDecoder::forward(std::vector* {"attention_mask", input_tensors->at(1)}, {"input_lengths", input_tensors->at(2)}, {"context_lengths", input_tensors->at(3)}, - {"num_tokens", input_tensors->at(4)}, - {"seq_len", input_tensors->at(5)}, - {"attn_len", input_tensors->at(6)}}; + {"seq_len", input_tensors->at(4)}, + {"attn_len", input_tensors->at(5)}}; std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, {"key_cache", output_tensors->at(1)}, {"value_cache", output_tensors->at(2)}}; @@ -195,7 +194,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* // attention_mask [batch_size, 1, seq_len, attn_len] // input_lengths [batch_size] // context_lengths [batch_size] - // num_tokens [1] int on cpu // seq_len [1] int on cpu // attn_len [1] int on cpu // padding_offset [batch_size] int on cpu @@ -210,20 +208,20 @@ void LLaMAContextDecoder::forward(std::unordered_map* FT_CHECK(output_tensors->size() == 3); const DataType data_type = getTensorType(); const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); - const int batch_size = input_tensors->at("input_lengths").shape[0]; - const int* input_lengths = input_tensors->at("input_lengths").getPtr(); - const int* context_lengths = input_tensors->at("context_lengths").getPtr(); - const int num_tokens = input_tensors->at("num_tokens").getVal(); - const int seq_len = input_tensors->at("attention_mask").shape[2]; - const int attn_len = input_tensors->at("attention_mask").shape[3]; - const int* padding_offset = nullptr; - const int* cu_seqlens = nullptr; + const size_t batch_size = input_tensors->at("input_lengths").shape[0]; + const size_t num_tokens = input_tensors->at("decoder_input").shape[0]; + const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; + + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int attn_len = input_tensors->at("attention_mask").shape[3]; + const int* padding_offset = nullptr; + const int* cu_seqlens = nullptr; if (is_unpadded_mha) { padding_offset = input_tensors->at("padding_offset").getPtr(); cu_seqlens = input_tensors->at("cu_seqlens").getPtr(); } - - const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; allocateBuffer(batch_size, seq_len, max_seq_len); sync_check_cuda_error(); @@ -244,11 +242,6 @@ void LLaMAContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } - size_t h_token_num = batch_size * seq_len; - if (is_unpadded_mha) { - h_token_num = num_tokens; - } - for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l) == false) { continue; @@ -270,13 +263,13 @@ void LLaMAContextDecoder::forward(std::unordered_map* layer_input, llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, layernorm_eps_, - h_token_num, + num_tokens, hidden_units_, stream_); sync_check_cuda_error(); TensorMap self_attention_input_tensors{ - {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, + {"input_query", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, decoder_normed_input_}}, {"attention_mask", Tensor{MEMORY_GPU, data_type, @@ -291,7 +284,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* if (is_unpadded_mha) { self_attention_input_tensors.insert("padding_offset", - Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset}); + Tensor{MEMORY_GPU, 
TYPE_INT32, {num_tokens}, padding_offset}); self_attention_input_tensors.insert("cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens}); } @@ -302,7 +295,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* }; TensorMap self_attention_output_tensors{ - {"hidden_features", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, + {"hidden_features", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, self_attn_output_}}, {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; @@ -319,22 +312,21 @@ void LLaMAContextDecoder::forward(std::unordered_map* llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, layernorm_eps_, - h_token_num, + num_tokens, hidden_units_, stream_); sync_check_cuda_error(); TensorMap ffn_input_tensors( - {{"ffn_input", - Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, decoder_normed_input_}}}); TensorMap ffn_output_tensors( - {{"ffn_output", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, layer_output}}}); + {{"ffn_output", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, layer_output}}}); ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); invokeAddBiasResidual(layer_output, self_attn_output_, llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, - h_token_num, + num_tokens, hidden_units_, stream_); diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index b098f28f7..580b05d5c 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -78,12 +78,11 @@ std::vector LLaMA::forward(th::Tensor& hidden_vector, th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, + th::Tensor& target_ids, th::Tensor& context_lengths, - const int64_t num_tokens, const int64_t seq_len, const int64_t attn_len, - const int64_t is_context - ) + const int64_t is_context) { CHECK_TH_CUDA(input_ids); CHECK_CONTIGUOUS(input_ids); @@ -92,9 +91,16 @@ std::vector LLaMA::forward(th::Tensor& hidden_vector, CHECK_CONTIGUOUS(input_lengths); TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); - const int batch_size = input_lengths.size(0); - ftllama->forward( - hidden_vector, log_probs, cum_probs, input_ids, input_lengths, context_lengths, num_tokens, seq_len, attn_len, is_context); + ftllama->forward(hidden_vector, + log_probs, + cum_probs, + input_ids, + input_lengths, + target_ids, + context_lengths, + seq_len, + attn_len, + is_context); return std::vector{hidden_vector, log_probs, cum_probs}; } diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index f5ffee6f7..029780e7f 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -34,8 +34,8 @@ class IFLLaMA { th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, + th::Tensor& target_ids, th::Tensor& context_lengths, - const int num_tokens, const int seq_len, const int attn_len, const int is_context) = 0; @@ 
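Each decoder layer above finishes with invokeAddBiasResidual, which adds the attention-branch activations kept in self_attn_output_ and the FFN output bias onto the FFN result in place. A minimal float-only sketch of that element-wise epilogue (the FT kernel is templated and more general):

```cuda
#include <cuda_runtime.h>

// out[t, h] = out[t, h] + residual[t, h] + bias[h]
// out/residual: [num_tokens, hidden_units], bias: [hidden_units]
__global__ void add_bias_residual(float* out, const float* residual, const float* bias,
                                  int num_tokens, int hidden_units)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_tokens * hidden_units)
        return;
    out[idx] += residual[idx] + bias[idx % hidden_units];
}
// launch: add_bias_residual<<<(num_tokens * hidden_units + 255) / 256, 256, 0, stream>>>(...)
```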
-184,26 +184,29 @@ class FTLLaMA: public IFLLaMA { th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, + th::Tensor& target_ids, th::Tensor& context_lengths, - const int num_tokens, const int seq_len, const int attn_len, const int is_context) override { const size_t batch_size = (size_t)input_lengths.size(0); + const size_t num_tokens = (size_t)input_ids.size(0); + const size_t beam_width = (size_t)target_ids.size(0); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_INT32, - std::vector{batch_size, (size_t)seq_len}, - get_ptr(input_ids)}}, + ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{num_tokens}, get_ptr(input_ids)}}, {"input_lengths", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(input_lengths)}}, + {"target_ids", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{beam_width, num_tokens}, + get_ptr(target_ids)}}, {"context_lengths", ft::Tensor{ ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(context_lengths)}}, - {"num_tokens", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &num_tokens}}, {"seq_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &seq_len}}, {"attn_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &attn_len}}, {"is_context", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &is_context}}}; @@ -212,15 +215,18 @@ class FTLLaMA: public IFLLaMA { {"hidden_vector", ft::Tensor{ft::MEMORY_GPU, (std::is_same::value) ? ft::TYPE_FP16 : ft::TYPE_FP32, - std::vector{(size_t)num_tokens, num_heads_ * size_per_head_}, + std::vector{num_tokens, num_heads_ * size_per_head_}, get_ptr(hidden_vector)}}, {"log_probs", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, - std::vector{(size_t)num_tokens, vocab_size_}, + std::vector{num_tokens, vocab_size_}, get_ptr(log_probs)}}, {"cum_probs", - ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, std::vector{batch_size}, get_ptr(cum_probs)}}}; + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_FP32, + std::vector{beam_width, batch_size}, + get_ptr(cum_probs)}}}; try { ft::check_cuda_error(cudaEventSynchronize(event_[ev_no_])); @@ -294,8 +300,8 @@ class LLaMA: public th::jit::CustomClassHolder { th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, + th::Tensor& target_ids, th::Tensor& context_lengths, - const int64_t num_tokens, const int64_t seq_len, const int64_t attn_len, const int64_t is_context); From a7e708917662079259a6fd9d3c545a90893c3ceb Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 1 Oct 2023 03:24:28 +0000 Subject: [PATCH 51/55] ref --- .../kernels/llama_kernels.cu | 26 ++ src/fastertransformer/kernels/llama_kernels.h | 2 + .../kernels/unfused_attention_kernels.cu | 66 ++-- .../kernels/unfused_attention_kernels.h | 16 +- .../LLaMAContextAttentionLayer.cc | 341 +++++++++--------- .../LLaMAContextAttentionLayer.h | 2 +- src/fastertransformer/models/llama/LLaMA.cc | 35 +- src/fastertransformer/models/llama/LLaMA.h | 27 +- .../models/llama/LLaMAContextDecoder.cc | 20 +- .../models/llama/LLaMAContextDecoder.h | 9 +- 10 files changed, 279 insertions(+), 265 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index d007350fe..d6d119227 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -327,4 +327,30 @@ void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream) template void 
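With the reworked FTLLaMA::forward interface, the caller pre-allocates hidden_vector as [num_tokens, num_heads * size_per_head], log_probs as [num_tokens, vocab_size] and cum_probs as [beam_width, batch_size]. A hedged sketch of that allocation with the ATen C++ API, assuming the FP16 instantiation of the model (the helper and its name are illustrative, not part of the extension):

```cuda
#include <torch/torch.h>
#include <tuple>

// Pre-allocate the three output tensors that FTLLaMA::forward fills in.
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
make_llama_outputs(int64_t num_tokens, int64_t hidden_units, int64_t vocab_size,
                   int64_t beam_width, int64_t batch_size)
{
    auto fp16 = torch::dtype(torch::kFloat16).device(torch::kCUDA);
    auto fp32 = torch::dtype(torch::kFloat32).device(torch::kCUDA);
    torch::Tensor hidden_vector = torch::empty({num_tokens, hidden_units}, fp16);
    torch::Tensor log_probs     = torch::empty({num_tokens, vocab_size}, fp32);
    torch::Tensor cum_probs     = torch::empty({beam_width, batch_size}, fp32);
    return std::make_tuple(hidden_vector, log_probs, cum_probs);
}
```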
invokeLLaMACopyKernel(float* dst, float* src, const int count, cudaStream_t stream); template void invokeLLaMACopyKernel(half* dst, half* src, const int count, cudaStream_t stream); +template +__global__ void LLaMAMemset0Kernel(T* dst, const int count) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + if (idx * X_ELEMS >= count) { + return; + } + + auto v_dst = reinterpret_cast(dst); + v_dst[idx] = {0}; +} + +template +void invokeLLaMAMemset0(T* dst, const int count, cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 4 : 8; + assert(count % x == 0); + int grid_sz = (count / x + block_sz - 1) / block_sz; + LLaMAMemset0Kernel<<>>(dst, count); +} + +template void invokeLLaMAMemset0(float* dst, const int count, cudaStream_t stream); +template void invokeLLaMAMemset0(half* dst, const int count, cudaStream_t stream); + } // namespace fastertransformer diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h index 754ed6bba..f0d356a09 100644 --- a/src/fastertransformer/kernels/llama_kernels.h +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -29,6 +29,8 @@ void invokeLLaMAInputIdsEmbeddingLookup(T* from_tensor, template void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream); +template +void invokeLLaMAMemset0(T* dst, const int count, cudaStream_t stream); void invokeLLaMAGatherTokens(float* out, const float* probs, diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 2f867186e..a513c1b47 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1902,11 +1902,11 @@ __global__ void transpose_4d_save_to_cache(T* k_dst, const T* k_src, T* v_dst, const T* v_src, + const int* context_lengths, const int head_num, const int size_per_head, const int seq_len, - const int max_seq_len, - const int* context_lengths) + const int max_seq_len) { // [batch_size, head_num, seq_len, size_per_head] const int batch_id = blockIdx.y; @@ -1943,20 +1943,20 @@ void invokeLLaMASaveToCache(T* k_dst, T* v_dst, const T* k_src, const T* v_src, - const int local_batch_size, + const int* context_lengths, + const int batch_size, + const int head_num, + const int size_per_head, const int seq_len, const int max_seq_len, - const int size_per_head, - const int local_head_num, - const int* context_lengths, cudaStream_t stream) { constexpr int block_sz = 128; constexpr int x = (sizeof(T) == 4) ? 
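The new invokeLLaMAMemset0 zeroes a buffer with 16-byte vector stores, using a 4-wide vector for 4-byte element types and an 8-wide one for 2-byte types, and asserts that count is divisible by the vector width. A standalone sketch specialized to float4, so each thread clears four floats with a single store:

```cuda
#include <cassert>
#include <cuda_runtime.h>

// Zero `count` floats with 16-byte stores; count must be a multiple of 4.
__global__ void memset0_float4(float* dst, int count)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx * 4 >= count)
        return;
    reinterpret_cast<float4*>(dst)[idx] = make_float4(0.f, 0.f, 0.f, 0.f);
}

void launch_memset0(float* dst, int count, cudaStream_t stream)
{
    constexpr int block = 128;
    assert(count % 4 == 0);
    const int grid = (count / 4 + block - 1) / block;
    memset0_float4<<<grid, block, 0, stream>>>(dst, count);
}
```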
4 : 8; - dim3 grid((seq_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); + dim3 grid((seq_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num); transpose_4d_save_to_cache<<>>( - k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, context_lengths); + k_dst, k_src, v_dst, v_src, context_lengths, head_num, size_per_head, seq_len, max_seq_len); } #define INSTANTIATESAVETOCACHE(T) \ @@ -1964,12 +1964,12 @@ void invokeLLaMASaveToCache(T* k_dst, T* v_dst, \ const T* k_src, \ const T* v_src, \ - const int local_batch_size, \ + const int* context_lengths, \ + const int batch_size, \ + const int head_num, \ + const int size_per_head, \ const int seq_len, \ const int max_seq_len, \ - const int size_per_head, \ - const int local_head_num, \ - const int* start_pos, \ cudaStream_t stream) INSTANTIATESAVETOCACHE(float); INSTANTIATESAVETOCACHE(half); @@ -1979,19 +1979,19 @@ INSTANTIATESAVETOCACHE(__nv_bfloat16); #undef INSTANTIATESAVETOCACHE template -__global__ void transpose_4d_load_from_cache(T* k_dst, - const T* k_src, - T* v_dst, - const T* v_src, - const int head_num, - const int size_per_head, - const int seq_len, - const int max_seq_len, - const int attn_len) +__global__ void transpose_4d_load_from_cache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int head_num, + const int size_per_head, + const int seq_len, + const int attn_len, + const int max_seq_len) { // [batch_size, head_num, attn_len, size_per_head] - const int batch_id = blockIdx.y; - const int head_id = blockIdx.z; + const int batch_id = blockIdx.y; + const int head_id = blockIdx.z; // 16 byte loads will handle "x" dimension auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * max_seq_len @@ -2022,20 +2022,20 @@ void invokeLLaMALoadFromCache(T* k_dst, T* v_dst, const T* k_src, const T* v_src, - const int local_batch_size, - const int seq_len, - const int max_seq_len, + const int batch_size, + const int head_num, const int size_per_head, - const int local_head_num, + const int seq_len, const int attn_len, + const int max_seq_len, cudaStream_t stream) { constexpr int block_sz = 128; constexpr int x = (sizeof(T) == 4) ? 
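invokeLLaMASaveToCache appends the K/V of the current chunk into the persistent [batch_size, head_num, max_seq_len, size_per_head] cache, starting at row context_lengths[b] for each sequence, and invokeLLaMALoadFromCache later reads the first attn_len rows back out in the same layout. A scalar (non-vectorized) sketch of the save direction under those layout assumptions:

```cuda
#include <cuda_runtime.h>

// cache layout: [batch, head_num, max_seq_len, size_per_head]
// src   layout: [batch, head_num, seq_len,     size_per_head]
// New rows for sequence b land at rows [context_lengths[b], context_lengths[b] + seq_len).
__global__ void save_to_cache(float* k_cache, const float* k_src,
                              const int* context_lengths,
                              int head_num, int seq_len, int max_seq_len, int size_per_head)
{
    const int b = blockIdx.z;   // batch index
    const int h = blockIdx.y;   // head index
    const int s = blockIdx.x;   // position inside the current chunk
    const int dst_row = context_lengths[b] + s;

    const float* src = k_src   + ((b * head_num + h) * seq_len     + s)       * size_per_head;
    float*       dst = k_cache + ((b * head_num + h) * max_seq_len + dst_row) * size_per_head;
    for (int i = threadIdx.x; i < size_per_head; i += blockDim.x) {
        dst[i] = src[i];
    }
}
// launch: save_to_cache<<<dim3(seq_len, head_num, batch_size), 128, 0, stream>>>(...)
```

The FT kernels do the same addressing with 16-byte loads and a flattened grid; the value cache is handled identically with a second pair of pointers.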
4 : 8; - dim3 grid((attn_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num); + dim3 grid((attn_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num); transpose_4d_load_from_cache<<>>( - k_dst, k_src, v_dst, v_src, local_head_num, size_per_head, seq_len, max_seq_len, attn_len); + k_dst, v_dst, k_src, v_src, head_num, size_per_head, seq_len, attn_len, max_seq_len); } #define INSTANTIATELOADFROMCACHE(T) \ @@ -2043,12 +2043,12 @@ void invokeLLaMALoadFromCache(T* k_dst, T* v_dst, \ const T* k_src, \ const T* v_src, \ - const int local_batch_size, \ + const int batch_size, \ + const int head_num, \ + const int size_per_head, \ const int seq_len, \ + const int attn_len, \ const int max_seq_len, \ - const int size_per_head, \ - const int local_head_num, \ - const int attn_len, \ cudaStream_t stream) INSTANTIATELOADFROMCACHE(float); INSTANTIATELOADFROMCACHE(half); diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index 52fa0f053..4f55af19e 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -209,24 +209,24 @@ void invokeLLaMASaveToCache(T* k_dst, T* v_dst, const T* k_src, const T* v_src, - const int local_batch_size, + const int* context_lengths, + const int batch_size, + const int head_num, + const int size_per_head, const int seq_len, const int max_seq_len, - const int size_per_head, - const int local_head_num, - const int* start_pos, cudaStream_t stream); template void invokeLLaMALoadFromCache(T* k_dst, T* v_dst, const T* k_src, const T* v_src, - const int local_batch_size, - const int seq_len, - const int max_seq_len, + const int batch_size, + const int head_num, const int size_per_head, - const int local_head_num, + const int seq_len, const int attn_len, + const int max_seq_len, cudaStream_t stream); template diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index c8777c4cc..10d7c7673 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -31,12 +31,10 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten // input_tensors: // input_query [num_tokens, hidden_dimension] // attention_mask [batch_size, 1, seq_len, attn_len] - // attention_type [1] - // layer_id [1], int on cpu // context_lengths, int, [batch_size] - // attn_len, int, [batch_size] on cpu - // padding_offset, int, [num_tokens] (optional) - // cu_seqlens, int, [batch_size] (optional) + // attention_type [1] + // padding_offset [num_tokens] (optional) + // cu_seqlens [batch_size+1] (optional) // output_tensors: // hidden_features [num_tokens, hidden_dimension] @@ -46,25 +44,27 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); FT_CHECK(output_tensors->at("key_cache").shape.size() == 4); FT_CHECK(output_tensors->at("value_cache").shape.size() == 4); - const int batch_size = input_tensors->at("attention_mask").shape[0]; - const int max_seq_len = (int)(output_tensors->at("key_cache").shape[2]); - const int layer_id = input_tensors->getVal("layer_id"); - const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); - const int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); - const int* 
context_lengths = input_tensors->at("context_lengths").getPtr(); - const int seq_len = input_tensors->at("attention_mask").shape[2]; - const int attn_len = input_tensors->at("attention_mask").shape[3]; - - T* attention_out = output_tensors->at("hidden_features").getPtr(); - T* attention_input = input_tensors->at("input_query").getPtr(); - T* attention_mask = input_tensors->at("attention_mask").getPtr(); - - const AttentionType attention_type = input_tensors->getVal("attention_type"); + const int batch_size = input_tensors->at("attention_mask").shape[0]; + const int seq_len = input_tensors->at("attention_mask").shape[2]; + const int attn_len = input_tensors->at("attention_mask").shape[3]; + const int max_seq_len = output_tensors->at("key_cache").shape[2]; + + T* attention_input = input_tensors->at("input_query").getPtr(); + T* attention_mask = input_tensors->at("attention_mask").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int* padding_offset = input_tensors->getPtr("padding_offset", nullptr); + const int* cu_seqlens = input_tensors->getPtr("cu_seqlens", nullptr); + const AttentionType attention_type = input_tensors->getVal("attention_type"); + T* attention_out = output_tensors->at("hidden_features").getPtr(); + T* key_cache = output_tensors->getPtr("key_cache"); + T* value_cache = output_tensors->getPtr("value_cache"); + + FT_CHECK_WITH_INFO(seq_len <= attn_len, "seq_len must be larger than or equal to attn_len"); FT_CHECK_WITH_INFO(attention_type != AttentionType::FUSED_PADDED_MHA, "LLaMA Context FUSED_PADDED_MHA is not supported !"); PUSH_RANGE("attention buffer alloc"); - allocateBuffer(batch_size, seq_len, max_seq_len, attention_type != AttentionType::FUSED_MHA); + allocateBuffer(batch_size, seq_len, attn_len, max_seq_len, attention_type != AttentionType::FUSED_MHA); POP_RANGE; sync_check_cuda_error(); @@ -85,11 +85,11 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten 3 * hidden_units_ /* n */); sync_check_cuda_error(); -// if (padding_offset != nullptr) { -// // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous -// cudaMemsetAsync(q_buf_2_, 0, batch_size * max_seq_len * 3 * hidden_units_ * sizeof(T), stream_); -// sync_check_cuda_error(); -// } + if (padding_offset != nullptr) { + // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous + cudaMemsetAsync(q_buf_2_, 0, batch_size * (seq_len + 2 * attn_len) * hidden_units_ * sizeof(T), stream_); + sync_check_cuda_error(); + } invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, @@ -106,177 +106,169 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten stream_); sync_check_cuda_error(); - // key_cache [batch, local_head_num, max_seq_len, size_per_head] - // value_cache [batch, local_head_num, max_seq_len, size_per_head] - T* key_cache = output_tensors->getPtr("key_cache"); - T* value_cache = output_tensors->getPtr("value_cache"); invokeLLaMASaveToCache(key_cache, value_cache, k_buf_2_, v_buf_2_, + context_lengths, batch_size, + head_num_, + size_per_head_, seq_len, max_seq_len, - size_per_head_, - head_num_, - context_lengths, stream_); sync_check_cuda_error(); - POP_RANGE; invokeLLaMALoadFromCache(k_buf_2_, v_buf_2_, key_cache, value_cache, batch_size, - seq_len, - max_seq_len, - size_per_head_, head_num_, + size_per_head_, + seq_len, attn_len, + max_seq_len, stream_); + sync_check_cuda_error(); - if (attention_type == AttentionType::FUSED_MHA) { - dispatcher_fp16->setup_causal_masked_fmha(seq_len, batch_size); - dispatcher_fp16->run_causal_masked_fmha(qkv_buf_, 
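The shape check above requires attn_len >= seq_len (despite the wording of the FT_CHECK message, it is the key/value length that must cover the query chunk): each query chunk attends to the cached context plus its own tokens. Under the plausible reading that attn_len spans context plus chunk, query position i of sequence b may look at key positions up to context_lengths[b] + i. A host-side sketch of such a [batch_size, seq_len, attn_len] mask, with 1.0 marking visible positions (the exact layout of input_attention_mask_ is an assumption here):

```cuda
#include <vector>

// mask[b][i][j] == 1.0f  iff  query i of sequence b may look at key j.
std::vector<float> build_causal_mask(const std::vector<int>& input_lengths,
                                     const std::vector<int>& context_lengths,
                                     int seq_len, int attn_len)
{
    const int batch_size = static_cast<int>(input_lengths.size());
    std::vector<float> mask(static_cast<size_t>(batch_size) * seq_len * attn_len, 0.f);
    for (int b = 0; b < batch_size; ++b) {
        for (int i = 0; i < input_lengths[b]; ++i) {
            const int visible = context_lengths[b] + i;  // last visible key index
            for (int j = 0; j <= visible && j < attn_len; ++j) {
                mask[(static_cast<size_t>(b) * seq_len + i) * attn_len + j] = 1.f;
            }
        }
    }
    return mask;
}
```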
cu_seqlens, qkv_buf_3_, true, stream_); + POP_RANGE; + + const cudaDataType_t gemm_data_type = getCudaDataType(); + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = attn_len; // kv length + const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); + + // + // softmax(Q*K^T) + // + if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { + PUSH_RANGE("Q*K batch gemm"); + + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, + CUBLAS_OP_N, + attention_seq_len_2, // n + attention_seq_len_1, // m + size_per_head_, // k + 1.0f, + k_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_2 * size_per_head_, // n * k + q_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_1 * size_per_head_, // m * k + 0.0f, + qk_buf_float_, + CUDA_R_32F, + attention_seq_len_2, // n + attention_seq_len_2 * attention_seq_len_1, + batch_size * head_num_, // global batch size + CUDA_R_32F); + sync_check_cuda_error(); + POP_RANGE; + + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = nullptr; + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; } else { - const cudaDataType_t gemm_data_type = getCudaDataType(); - const int attention_seq_len_1 = seq_len; // q length - const int attention_seq_len_2 = attn_len; // kv length - const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); - - // - // softmax(Q*K^T) - // - if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { - PUSH_RANGE("Q*K batch gemm"); - - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, - CUBLAS_OP_N, - attention_seq_len_2, // n - attention_seq_len_1, // m - size_per_head_, // k - 1.0f, - k_buf_2_, - gemm_data_type, - size_per_head_, // k - attention_seq_len_2 * size_per_head_, // n * k - q_buf_2_, - gemm_data_type, - size_per_head_, // k - attention_seq_len_1 * size_per_head_, // m * k - 0.0f, - qk_buf_float_, - CUDA_R_32F, - attention_seq_len_2, // n - attention_seq_len_2 * attention_seq_len_1, - batch_size * head_num_, // global batch size - CUDA_R_32F); - sync_check_cuda_error(); - POP_RANGE; - - PUSH_RANGE("softmax"); - MaskedSoftmaxParam param; - param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) - param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = batch_size; - param.q_length = attention_seq_len_1; - param.k_length = attention_seq_len_2; - param.num_heads = head_num_; - param.qk_scale = qk_scale; - param.linear_bias_slopes = nullptr; - invokeMaskedSoftmax(param, stream_); - sync_check_cuda_error(); - POP_RANGE; - } - else { - PUSH_RANGE("Q*K batch gemm"); - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, - CUBLAS_OP_N, - attention_seq_len_2, - attention_seq_len_1, - size_per_head_, - k_buf_2_, - size_per_head_, - attention_seq_len_2 * size_per_head_, - q_buf_2_, - size_per_head_, - attention_seq_len_1 * size_per_head_, - qk_buf_, - attention_seq_len_2, - attention_seq_len_2 * attention_seq_len_1, - batch_size * head_num_); - - POP_RANGE; - 
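invokeMaskedSoftmax turns the raw Q*K^T scores into attention weights: each score is scaled by qk_scale = 1/sqrt(size_per_head), positions the mask hides receive a large negative bias, and a numerically stable softmax is taken over the key dimension. A scalar host reference for one row of scores, under the convention that mask values of 1.0 mean "visible" (the -10000 bias is a common choice, assumed here rather than taken from the kernel):

```cuda
#include <math.h>

// Reference: masked softmax over one row of qk scores of length k_length.
void masked_softmax_row(float* row, const float* mask, int k_length, float qk_scale)
{
    float max_val = -1e20f;
    for (int j = 0; j < k_length; ++j) {
        row[j] = row[j] * qk_scale + (1.0f - mask[j]) * -10000.0f;
        max_val = fmaxf(max_val, row[j]);
    }
    float sum = 0.f;
    for (int j = 0; j < k_length; ++j) {
        row[j] = expf(row[j] - max_val);
        sum += row[j];
    }
    const float inv_sum = 1.0f / (sum + 1e-6f);
    for (int j = 0; j < k_length; ++j) {
        row[j] *= inv_sum;
    }
}
```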
PUSH_RANGE("softmax"); - MaskedSoftmaxParam param; - param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.qk = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = batch_size; - param.q_length = attention_seq_len_1; - param.k_length = attention_seq_len_2; - param.num_heads = head_num_; - param.qk_scale = qk_scale; - param.linear_bias_slopes = nullptr; - invokeMaskedSoftmax(param, stream_); - sync_check_cuda_error(); - POP_RANGE; - } - - PUSH_RANGE("QK*V batch gemm"); - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, + PUSH_RANGE("Q*K batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, CUBLAS_OP_N, - size_per_head_, - attention_seq_len_1, attention_seq_len_2, - - v_buf_2_, + attention_seq_len_1, + size_per_head_, + k_buf_2_, size_per_head_, attention_seq_len_2 * size_per_head_, - - qk_buf_, - attention_seq_len_2, - attention_seq_len_1 * attention_seq_len_2, - - qkv_buf_2_, + q_buf_2_, size_per_head_, attention_seq_len_1 * size_per_head_, - + qk_buf_, + attention_seq_len_2, + attention_seq_len_2 * attention_seq_len_1, batch_size * head_num_); - sync_check_cuda_error(); - // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) - if (padding_offset == nullptr) { - invokeTransposeQKV(qkv_buf_3_, - qkv_buf_2_, - batch_size, - attention_seq_len_1, - head_num_, - size_per_head_, - attention_weights->attention_output_weight.scale, - 0, // int8_mode - stream_); - sync_check_cuda_error(); - } - else { - invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, - qkv_buf_3_, - num_tokens, - batch_size, - attention_seq_len_1, - head_num_, - size_per_head_, - padding_offset, - attention_weights->attention_output_weight.scale, - 0, // int8_mode - stream_); - sync_check_cuda_error(); - } + POP_RANGE; + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = nullptr; + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); POP_RANGE; } + + PUSH_RANGE("QK*V batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, + CUBLAS_OP_N, + size_per_head_, + attention_seq_len_1, + attention_seq_len_2, + + v_buf_2_, + size_per_head_, + attention_seq_len_2 * size_per_head_, + + qk_buf_, + attention_seq_len_2, + attention_seq_len_1 * attention_seq_len_2, + + qkv_buf_2_, + size_per_head_, + attention_seq_len_1 * size_per_head_, + + batch_size * head_num_); + sync_check_cuda_error(); + + // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) + if (padding_offset == nullptr) { + invokeTransposeQKV(qkv_buf_3_, + qkv_buf_2_, + batch_size, + attention_seq_len_1, + head_num_, + size_per_head_, + attention_weights->attention_output_weight.scale, + 0, // int8_mode + stream_); + sync_check_cuda_error(); + } + else { + invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, + qkv_buf_3_, + num_tokens, + batch_size, + attention_seq_len_1, + head_num_, + size_per_head_, + padding_offset, + attention_weights->attention_output_weight.scale, + 0, // int8_mode + stream_); + sync_check_cuda_error(); + } + POP_RANGE; 
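After the QK*V batched GEMM, invokeTransposeAttentionOutRemovePadding both transposes the attention output from [batch, head_num, seq_len, size_per_head] back to a token-major layout and drops the padding rows. A scalar sketch of that combined gather/transpose, assuming the usual FT convention that padding_offset[t] is the number of padding slots before packed token t:

```cuda
#include <cuda_runtime.h>

// src: [batch, head_num, seq_len, size_per_head]   (padded, head-major)
// dst: [num_tokens, head_num * size_per_head]      (packed, token-major)
__global__ void transpose_remove_padding(float* dst, const float* src, const int* padding_offset,
                                         int head_num, int seq_len, int size_per_head)
{
    const int t      = blockIdx.x;                 // packed token index
    const int padded = t + padding_offset[t];      // position in the padded grid
    const int b      = padded / seq_len;
    const int s      = padded % seq_len;

    for (int i = threadIdx.x; i < head_num * size_per_head; i += blockDim.x) {
        const int h = i / size_per_head;
        const int d = i % size_per_head;
        dst[t * head_num * size_per_head + i] =
            src[((b * head_num + h) * seq_len + s) * size_per_head + d];
    }
}
// launch: transpose_remove_padding<<<num_tokens, 256, 0, stream>>>(...)
```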
sync_check_cuda_error(); PUSH_RANGE("proj gemm"); @@ -387,21 +379,18 @@ void LLaMAContextAttentionLayer::allocateBuffer() } template -void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, - size_t seq_len, - size_t max_seq_len, - bool allocate_qk_buf) +void LLaMAContextAttentionLayer::allocateBuffer( + size_t batch_size, size_t seq_len, size_t attn_len, size_t max_seq_len, bool allocate_qk_buf) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * max_seq_len * 3 * hidden_units_, false); - k_buf_2_ = q_buf_2_ + batch_size * max_seq_len * hidden_units_; - v_buf_2_ = k_buf_2_ + batch_size * max_seq_len * hidden_units_; + q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * (seq_len + 2 * attn_len) * hidden_units_, false); + k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; + v_buf_2_ = k_buf_2_ + batch_size * attn_len * hidden_units_; // save memory usage when using fmha if (allocate_qk_buf) { - qk_buf_ = - (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * max_seq_len * max_seq_len, false); + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * attn_len, false); } else { allocator_->free((void**)(&qk_buf_)); @@ -413,7 +402,7 @@ void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, if (is_qk_buf_float_ == true) { if (allocate_qk_buf) { qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * head_num_ * max_seq_len * max_seq_len, false); + qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * attn_len, false); } else { allocator_->free((void**)(&qk_buf_float_)); @@ -431,6 +420,8 @@ void LLaMAContextAttentionLayer::freeBuffer() FT_LOG_DEBUG(__PRETTY_FUNCTION__); allocator_->free((void**)(&qkv_buf_)); allocator_->free((void**)(&q_buf_2_)); + allocator_->free((void**)(&k_buf_2_)); + allocator_->free((void**)(&v_buf_2_)); allocator_->free((void**)(&qk_buf_)); allocator_->free((void**)(&qkv_buf_2_)); allocator_->free((void**)(&qkv_buf_3_)); diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index 7300186ba..4557abf1d 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -38,7 +38,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { std::unique_ptr dispatcher_fp16; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len, bool allocate_qk_buf); + void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, size_t max_seq_len, bool allocate_qk_buf); void freeBuffer() override; using BaseAttentionLayer::is_free_buffer_after_forward_; diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index f0f6d4697..55c507318 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -59,33 +59,29 @@ void LLaMA::allocateBuffer() } template -void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len) +void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, int is_context) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - const size_t self_cache_size = (num_layer_ 
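The reworked allocateBuffer above carves q_buf_2_, k_buf_2_ and v_buf_2_ out of a single allocation of batch_size * (seq_len + 2 * attn_len) * hidden_units_ elements: the query slice only needs seq_len rows per sequence, while the key and value slices hold attn_len rows each once the cache is merged back in. A sketch of that packing; with sub-allocation like this, only the base pointer should ever be handed back to the allocator:

```cuda
#include <cuda_runtime.h>

struct QkvScratch {
    float* q;     // [batch, seq_len,  hidden]
    float* k;     // [batch, attn_len, hidden]
    float* v;     // [batch, attn_len, hidden]
    float* base;  // owning pointer
};

// One cudaMalloc, three views; only `base` is ever freed.
QkvScratch alloc_qkv_scratch(size_t batch, size_t seq_len, size_t attn_len, size_t hidden)
{
    QkvScratch s{};
    const size_t total = batch * (seq_len + 2 * attn_len) * hidden;
    cudaMalloc(reinterpret_cast<void**>(&s.base), total * sizeof(float));
    s.q = s.base;
    s.k = s.q + batch * seq_len * hidden;
    s.v = s.k + batch * attn_len * hidden;
    return s;
}

void free_qkv_scratch(QkvScratch& s)
{
    cudaFree(s.base);  // q/k/v are offsets into the same allocation
    s = QkvScratch{};
}
```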
/ pipeline_para_.world_size_) * batch_size * max_seq_len * hidden_units_; padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); input_attention_mask_ = - (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * max_seq_len * max_seq_len, false)); + (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * attn_len, false)); - key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); - value_cache_ = key_cache_ + self_cache_size; + if (is_context) { + const size_t self_cache_size = + (num_layer_ / pipeline_para_.world_size_) * batch_size * max_seq_len_ * hidden_units_; + key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); + value_cache_ = key_cache_ + self_cache_size; + } context_decoder_input_buf_ = (T*)(allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); context_decoder_output_buf_ = (T*)(allocator_->reMalloc( context_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); -#ifdef USE_NCCL - for (int i = 0; i < num_buffers_; ++i) { - context_decoder_output_buf_clone_[i] = (T*)(allocator_->reMalloc( - context_decoder_output_buf_clone_[i], sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); - } -#endif - context_output_buf_ = (T*)(allocator_->reMalloc(context_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); normed_decoder_output_buf_ = @@ -95,6 +91,13 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_ log_likelihood_buf_ = (float*)(allocator_->reMalloc(log_likelihood_buf_, sizeof(float) * batch_size * seq_len * vocab_size_, false)); +#ifdef USE_NCCL + for (int i = 0; i < num_buffers_; ++i) { + context_decoder_output_buf_clone_[i] = (T*)(allocator_->reMalloc( + context_decoder_output_buf_clone_[i], sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); + } +#endif + is_allocate_buffer_ = true; } @@ -109,12 +112,14 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&context_decoder_input_buf_)); allocator_->free((void**)(&context_decoder_output_buf_)); allocator_->free((void**)(&context_output_buf_)); + allocator_->free((void**)(&normed_decoder_output_buf_)); + allocator_->free((void**)(&logits_buf_)); + allocator_->free((void**)(&log_likelihood_buf_)); #ifdef USE_NCCL for (int i = 0; i < num_buffers_; ++i) { allocator_->free((void**)(&context_decoder_output_buf_clone_[i])); } #endif - allocator_->free((void**)(&logits_buf_)); is_allocate_buffer_ = false; } } @@ -268,7 +273,9 @@ void LLaMA::forward(std::unordered_map* output_ten float* log_probs = output_tensors->at("log_probs").getPtr(); float* cum_probs = output_tensors->at("cum_probs").getPtr(); - allocateBuffer(batch_size, seq_len, max_seq_len_); + FT_CHECK_WITH_INFO(seq_len <= attn_len, "seq_len must be larger than or equal to attn_len"); + + allocateBuffer(batch_size, seq_len, attn_len, is_context); sync_check_cuda_error(); if (is_unpadded_mha) { diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 592861008..ee9442158 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -64,26 +64,23 @@ class LLaMA: public BaseLayer { LLaMAContextDecoder* llama_context_decoder_; void allocateBuffer() override; - void 
allocateBuffer(size_t batch_size, size_t max_seq_len, size_t max_cache_seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, int is_context); void freeBuffer() override; void initialize(); protected: - int* padding_offset_ = nullptr; - int* cu_seqlens_ = nullptr; - T* input_attention_mask_ = nullptr; - T* key_cache_ = nullptr; - T* value_cache_ = nullptr; - - T* decoder_output_buf_ = nullptr; - T* context_output_buf_ = nullptr; - T* normed_decoder_output_buf_ = nullptr; - float* logits_buf_ = nullptr; - float* log_likelihood_buf_ = nullptr; - - T* context_decoder_input_buf_ = nullptr; - T* context_decoder_output_buf_ = nullptr; + int* padding_offset_ = nullptr; + int* cu_seqlens_ = nullptr; + T* input_attention_mask_ = nullptr; + T* key_cache_ = nullptr; + T* value_cache_ = nullptr; + T* context_output_buf_ = nullptr; + T* normed_decoder_output_buf_ = nullptr; + float* logits_buf_ = nullptr; + float* log_likelihood_buf_ = nullptr; + T* context_decoder_input_buf_ = nullptr; + T* context_decoder_output_buf_ = nullptr; void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, const std::unordered_map* input_tensors); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 5c90f303e..2709b2164 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -60,7 +60,7 @@ void LLaMAContextDecoder::allocateBuffer() } template -void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len) +void LLaMAContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) { decoder_normed_input_ = reinterpret_cast( @@ -175,8 +175,7 @@ void LLaMAContextDecoder::forward(std::vector* {"attention_mask", input_tensors->at(1)}, {"input_lengths", input_tensors->at(2)}, {"context_lengths", input_tensors->at(3)}, - {"seq_len", input_tensors->at(4)}, - {"attn_len", input_tensors->at(5)}}; + {"seq_len", input_tensors->at(4)}}; std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, {"key_cache", output_tensors->at(1)}, {"value_cache", output_tensors->at(2)}}; @@ -195,22 +194,20 @@ void LLaMAContextDecoder::forward(std::unordered_map* // input_lengths [batch_size] // context_lengths [batch_size] // seq_len [1] int on cpu - // attn_len [1] int on cpu // padding_offset [batch_size] int on cpu // cu_seqlens [batch_size+1] int on cpu // output tensors: // decoder_output [num_tokens, hidden_dimension], // key_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] - // value_cache [num_layer, batch, local_head_num, mxa_seq_len, size_per_head] + // value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] - FT_CHECK(input_tensors->size() >= 7); + FT_CHECK(input_tensors->size() >= 5); FT_CHECK(output_tensors->size() == 3); const DataType data_type = getTensorType(); const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); const size_t batch_size = input_tensors->at("input_lengths").shape[0]; const size_t num_tokens = input_tensors->at("decoder_input").shape[0]; - const size_t max_seq_len = output_tensors->at("key_cache").shape[3]; const int* input_lengths = input_tensors->at("input_lengths").getPtr(); const int* context_lengths = input_tensors->at("context_lengths").getPtr(); @@ -222,7 +219,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* padding_offset = input_tensors->at("padding_offset").getPtr(); cu_seqlens = 
input_tensors->at("cu_seqlens").getPtr(); } - allocateBuffer(batch_size, seq_len, max_seq_len); + allocateBuffer(batch_size, seq_len); sync_check_cuda_error(); T* decoder_input = input_tensors->at("decoder_input").getPtr(); @@ -250,14 +247,12 @@ void LLaMAContextDecoder::forward(std::unordered_map* const bool is_final = false; T* layer_input = decoder_layer_output_; T* layer_output = decoder_layer_output_; - // if (!is_unpadded_mha) { if (isFirstLayerParallelId(l)) { layer_input = decoder_input; } if (isLastLayerParallelId(l)) { layer_output = decoder_output; } - // } invokeGeneralLLaMALayerNorm(decoder_normed_input_, layer_input, @@ -275,11 +270,8 @@ void LLaMAContextDecoder::forward(std::unordered_map* data_type, {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(attn_len)}, attention_mask}}, - {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, - {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}, - {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, input_lengths}}, {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, context_lengths}}, - {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &attn_len}}, + {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, }; if (is_unpadded_mha) { diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index eb4e64ef0..94eb82a37 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -56,7 +56,7 @@ class LLaMAContextDecoder: public BaseLayer { FfnLayer* ffn_layer_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len, size_t max_seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len); void freeBuffer() override; bool isValidLayerParallelId(uint l); @@ -67,10 +67,9 @@ class LLaMAContextDecoder: public BaseLayer { void initialize(); protected: - T* decoder_normed_input_ = nullptr; - T* self_attn_output_ = nullptr; - T* decoder_layer_output_ = nullptr; - size_t* h_pinned_token_num_ptr_ = nullptr; + T* decoder_normed_input_ = nullptr; + T* self_attn_output_ = nullptr; + T* decoder_layer_output_ = nullptr; public: LLaMAContextDecoder(size_t head_num, From 5515d83db320d01158466393066189975e813163 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 1 Oct 2023 05:38:47 +0000 Subject: [PATCH 52/55] remove mpi requirement --- examples/cpp/CMakeLists.txt | 2 +- .../LLaMAContextAttentionLayer.cc | 38 ----- .../LLaMAContextAttentionLayer.h | 21 +-- .../models/llama/CMakeLists.txt | 3 - src/fastertransformer/models/llama/LLaMA.cc | 143 ++---------------- src/fastertransformer/models/llama/LLaMA.h | 66 ++------ .../models/llama/LLaMAContextDecoder.cc | 29 ++-- .../models/llama/LLaMAContextDecoder.h | 11 +- src/fastertransformer/th_op/llama/LLaMA.cc | 12 +- src/fastertransformer/th_op/llama/LLaMA.h | 28 ++-- 10 files changed, 66 insertions(+), 287 deletions(-) diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 38ae86412..800dfdd7f 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -26,8 +26,8 @@ add_subdirectory(wenet) add_subdirectory(gptj) add_subdirectory(gptneox) -add_subdirectory(llama) add_subdirectory(multi_gpu_gpt) +#add_subdirectory(llama) if(ENABLE_FP8) add_subdirectory(gpt_fp8) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc 
b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 10d7c7673..daf3d9178 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -293,43 +293,6 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, - size_t size_per_head, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float): - BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), - head_num_(head_num), - size_per_head_(size_per_head), - hidden_units_(head_num * size_per_head), - rotary_embedding_dim_(0), - is_qk_buf_float_(is_qk_buf_float) -{ -} - -template -LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, - size_t size_per_head, - size_t local_head_num, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float): - BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), - head_num_(head_num), - size_per_head_(size_per_head), - hidden_units_(head_num * size_per_head), - rotary_embedding_dim_(0), - is_qk_buf_float_(is_qk_buf_float) -{ - FT_LOG_DEBUG(__PRETTY_FUNCTION__); - dispatcher_fp16.reset(new FusedMHARunnerFP16v2(head_num_, size_per_head_, sm_, 1.0f)); -} - template LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, @@ -348,7 +311,6 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_ is_qk_buf_float_(is_qk_buf_float) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); - dispatcher_fp16.reset(new FusedMHARunnerFP16v2(head_num_, size_per_head_, sm_, 1.0f)); } template diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index 4557abf1d..8d24689a8 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -34,9 +34,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { const size_t rotary_embedding_dim_; // fmha runner - int sm_ = getSMVersion(); - std::unique_ptr dispatcher_fp16; - + int sm_ = getSMVersion(); void allocateBuffer() override; void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, size_t max_seq_len, bool allocate_qk_buf); void freeBuffer() override; @@ -60,23 +58,6 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { T* qkv_buf_3_ = nullptr; public: - LLaMAContextAttentionLayer(size_t head_num, - size_t size_per_head, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float); - - LLaMAContextAttentionLayer(size_t head_num, - size_t size_per_head, - size_t local_head_num, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float); - LLaMAContextAttentionLayer(size_t head_num, size_t size_per_head, size_t local_head_num, diff --git a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt index 24acf1d78..287a350da 100644 --- 
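With the NCCL plumbing removed, the model now keeps only rank_ and world_size_ and sizes its per-rank KV cache as num_layer_ / world_size_ layers. A hedged sketch of the contiguous layer partition such rank bookkeeping implies (FT's isValidLayerParallelId and friends follow this pattern; the helpers here are illustrative and assume num_layer is divisible by world_size):

```cuda
#include <cassert>

// Contiguous pipeline partition: rank r owns layers [r * L/W, (r + 1) * L/W).
struct LayerRange {
    int first;
    int last;  // exclusive
};

LayerRange layers_for_rank(int num_layer, int world_size, int rank)
{
    assert(num_layer % world_size == 0);
    const int per_rank = num_layer / world_size;
    return LayerRange{rank * per_rank, (rank + 1) * per_rank};
}

bool owns_layer(const LayerRange& r, int layer)  // isValidLayerParallelId analogue
{
    return layer >= r.first && layer < r.last;
}
```

Rank 0 then performs the embedding lookup and the last rank produces the logits, which is the role the rank_ == 0 and rank_ == world_size_ - 1 branches in LLaMA::forward play.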
a/src/fastertransformer/models/llama/CMakeLists.txt +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -44,10 +44,7 @@ set_property(TARGET LLaMA PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(LLaMA PUBLIC -lcudart LLaMAContextDecoder decoding_kernels - gpt_kernels llama_kernels - BaseBeamSearchLayer - bert_preprocess_kernels tensor LLaMAWeight cuda_utils diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 55c507318..3884a79ac 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -15,10 +15,8 @@ */ #include "src/fastertransformer/models/llama/LLaMA.h" -#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/decoding_kernels.h" #include "src/fastertransformer/kernels/llama_kernels.h" -#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" #include "src/fastertransformer/utils/llama_utils.h" #include "src/fastertransformer/utils/memory_utils.h" #include @@ -29,21 +27,14 @@ namespace fastertransformer { template void LLaMA::initialize() { -#ifdef USE_NCCL - check_cuda_error(cudaStreamCreateWithFlags(&comm_stream_, cudaStreamNonBlocking)); - for (int i = 0; i < num_buffers_; ++i) { - check_cuda_error(cudaEventCreate(&kern_event_[i])); - check_cuda_error(cudaEventCreate(&comm_event_[i])); - } -#endif - llama_context_decoder_ = new LLaMAContextDecoder(head_num_, size_per_head_, inter_size_, num_layer_, rotary_embedding_dim_, layernorm_eps_, - pipeline_para_, + rank_, + world_size_, stream_, cublas_wrapper_, allocator_, @@ -72,7 +63,7 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len if (is_context) { const size_t self_cache_size = - (num_layer_ / pipeline_para_.world_size_) * batch_size * max_seq_len_ * hidden_units_; + (num_layer_ / world_size_) * batch_size * max_seq_len_ * hidden_units_; key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); value_cache_ = key_cache_ + self_cache_size; } @@ -91,13 +82,6 @@ void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len log_likelihood_buf_ = (float*)(allocator_->reMalloc(log_likelihood_buf_, sizeof(float) * batch_size * seq_len * vocab_size_, false)); -#ifdef USE_NCCL - for (int i = 0; i < num_buffers_; ++i) { - context_decoder_output_buf_clone_[i] = (T*)(allocator_->reMalloc( - context_decoder_output_buf_clone_[i], sizeof(T) * batch_size * max_seq_len * hidden_units_, false)); - } -#endif - is_allocate_buffer_ = true; } @@ -115,11 +99,6 @@ void LLaMA::freeBuffer() allocator_->free((void**)(&normed_decoder_output_buf_)); allocator_->free((void**)(&logits_buf_)); allocator_->free((void**)(&log_likelihood_buf_)); -#ifdef USE_NCCL - for (int i = 0; i < num_buffers_; ++i) { - allocator_->free((void**)(&context_decoder_output_buf_clone_[i])); - } -#endif is_allocate_buffer_ = false; } } @@ -133,6 +112,8 @@ LLaMA::LLaMA(size_t head_num, size_t rotary_embedding_dim, size_t random_seed, size_t max_seq_len, + size_t rank, + size_t world_size, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, @@ -149,41 +130,8 @@ LLaMA::LLaMA(size_t head_num, random_seed_(random_seed), max_seq_len_(max_seq_len), hidden_units_(head_num * size_per_head), - attention_type_(attention_type) -{ - pipeline_para_.world_size_ = 1; - pipeline_para_.rank_ = 0; - initialize(); -} - -template -LLaMA::LLaMA(size_t head_num, - size_t size_per_head, - size_t 
inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - size_t random_seed, - size_t max_seq_len, - NcclParam tensor_para, - NcclParam pipeline_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop, - AttentionType attention_type): - BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), - head_num_(head_num), - size_per_head_(size_per_head), - inter_size_(inter_size), - num_layer_(num_layer), - vocab_size_(vocab_size), - rotary_embedding_dim_(rotary_embedding_dim), - random_seed_(random_seed), - max_seq_len_(max_seq_len), - hidden_units_(head_num * size_per_head), - pipeline_para_(pipeline_para), + rank_(rank), + world_size_(world_size), attention_type_(attention_type) { initialize(); @@ -201,7 +149,8 @@ LLaMA::LLaMA(LLaMA const& llama): random_seed_(llama.random_seed_), max_seq_len_(llama.max_seq_len_), hidden_units_(llama.hidden_units_), - pipeline_para_(llama.pipeline_para_), + rank_(llama.rank_), + world_size_(llama.world_size_), attention_type_(llama.attention_type_) { initialize(); @@ -210,14 +159,6 @@ LLaMA::LLaMA(LLaMA const& llama): template LLaMA::~LLaMA() { -#ifdef USE_NCCL - check_cuda_error(cudaStreamDestroy(comm_stream_)); - for (int i = 0; i < num_buffers_; ++i) { - check_cuda_error(cudaEventDestroy(kern_event_[i])); - check_cuda_error(cudaEventDestroy(comm_event_[i])); - } -#endif - delete llama_context_decoder_; freeBuffer(); } @@ -288,7 +229,7 @@ void LLaMA::forward(std::unordered_map* output_ten input_attention_mask_, input_lengths, context_lengths, batch_size, seq_len, attn_len, stream_); sync_check_cuda_error(); - if (pipeline_para_.rank_ == 0) { + if (rank_ == 0) { invokeLLaMAInputIdsEmbeddingLookup(context_decoder_input_buf_, llama_weights->pre_decoder_embedding_table, input_ids, @@ -297,24 +238,13 @@ void LLaMA::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); } - else { -#ifdef USE_NCCL - ftNcclRecv( - context_decoder_input_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, stream_); - sync_check_cuda_error(); -#endif - } std::unordered_map decoder_input_tensors{ {"decoder_input", Tensor{MEMORY_GPU, data_type, {num_tokens, hidden_units_}, -#ifdef USE_NCCL - context_decoder_input_buf_ -#else - pipeline_para_.rank_ == 0 ? context_decoder_input_buf_ : hidden_vector -#endif + rank_ == 0 ? context_decoder_input_buf_ : hidden_vector }}, {"attention_mask", Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, @@ -333,55 +263,24 @@ void LLaMA::forward(std::unordered_map* output_ten Tensor{MEMORY_GPU, data_type, {num_tokens, hidden_units_}, -#ifdef USE_NCCL - context_decoder_output_buf_ -#else - (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) ? context_decoder_output_buf_ : hidden_vector -#endif + (rank_ == world_size_ - 1) ? 
context_decoder_output_buf_ : hidden_vector }}, {"key_cache", Tensor{MEMORY_GPU, data_type, - {num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, + {num_layer_ / world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, key_cache_}}, {"value_cache", Tensor{MEMORY_GPU, data_type, - {num_layer_ / pipeline_para_.world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, + {num_layer_ / world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, value_cache_}}}; llama_context_decoder_->forward( &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); sync_check_cuda_error(); - if (pipeline_para_.rank_ < pipeline_para_.world_size_ - 1) { -#ifdef USE_NCCL - // buf_no_ = (buf_no_ + 1) % num_buffers_; - // check_cuda_error(cudaEventSynchronize(comm_event_[buf_no_])); - // invokeLLaMACopyKernel(context_decoder_output_buf_clone_[buf_no_], - // context_decoder_output_buf_, - // num_tokens * hidden_units_, - // stream_); - // sync_check_cuda_error(); - // check_cuda_error(cudaEventRecord(kern_event_[buf_no_], stream_)); - // check_cuda_error(cudaStreamWaitEvent(comm_stream_, kern_event_[buf_no_])); - // ftNcclGroupStart(); - // ftNcclSend(context_decoder_output_buf_clone_[buf_no_], - // num_tokens * hidden_units_, - // pipeline_para_.rank_ + 1, - // pipeline_para_, - // comm_stream_); - // ftNcclGroupEnd(); - // check_cuda_error(cudaEventRecord(comm_event_[buf_no_], comm_stream_)); - // sync_check_cuda_error(); - - ftNcclGroupStart(); - ftNcclSend( - context_decoder_output_buf_, num_tokens * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, stream_); - ftNcclGroupEnd(); -#endif - } - else if (is_context) { + if (is_context) { invokeLLaMAGetLastTokens( context_output_buf_, context_decoder_output_buf_, cu_seqlens_, batch_size, hidden_units_, stream_); sync_check_cuda_error(); @@ -455,18 +354,6 @@ void LLaMA::forward(std::unordered_map* output_ten } } -template -size_t LLaMA::getPipelineParallelRank() -{ - return pipeline_para_.rank_; -} - -template -size_t LLaMA::getPipelineParallelSize() -{ - return pipeline_para_.world_size_; -} - template class LLaMA; template class LLaMA; diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index ee9442158..7be6120ab 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -23,42 +23,26 @@ #include "src/fastertransformer/models/llama/LLaMAWeight.h" #include "src/fastertransformer/utils/custom_ar_comm.h" -//#define USE_NCCL - namespace fastertransformer { template class LLaMA: public BaseLayer { private: // meta data - size_t head_num_; - size_t size_per_head_; - size_t inter_size_; - size_t num_layer_; - size_t vocab_size_; - size_t rotary_embedding_dim_; - size_t random_seed_; - size_t max_seq_len_; - -#ifdef USE_NCCL - static constexpr int num_buffers_ = 5; - int buf_no_ = 0; - cudaStream_t comm_stream_; - cudaEvent_t kern_event_[num_buffers_]; - cudaEvent_t comm_event_[num_buffers_]; - T* context_decoder_output_buf_clone_[num_buffers_] = {nullptr}; -#endif - + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t vocab_size_; + size_t rotary_embedding_dim_; + size_t random_seed_; + size_t max_seq_len_; + size_t hidden_units_; + size_t rank_; + size_t world_size_; static constexpr float layernorm_eps_ = 1e-6f; - - size_t hidden_units_; - - NcclParam tensor_para_; - NcclParam pipeline_para_; - - AttentionType 
attention_type_; - - const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr + AttentionType attention_type_; + const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); LLaMAContextDecoder* llama_context_decoder_; @@ -94,23 +78,8 @@ class LLaMA: public BaseLayer { size_t rotary_embedding_dim, size_t random_seed, size_t max_seq_len, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - AttentionType attention_type = AttentionType::UNFUSED_MHA); - - LLaMA(size_t head_num, - size_t size_per_head, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - size_t rotary_embedding_dim, - size_t random_seed, - size_t max_seq_len, - NcclParam tensor_para, - NcclParam pipeline_para, + size_t rank, + size_t world_size, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, @@ -130,11 +99,6 @@ class LLaMA: public BaseLayer { const std::unordered_map* input_tensors, const LLaMAWeight* llama_weights); - size_t getPipelineParallelRank(); - size_t getPipelineParallelSize(); - size_t getTensorParallelRank(); - size_t getTensorParallelSize(); - bool* getFinishBuffer(); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 2709b2164..1d5901c61 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -15,10 +15,7 @@ */ #include "src/fastertransformer/models/llama/LLaMAContextDecoder.h" -#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" -#include "src/fastertransformer/kernels/gpt_kernels.h" #include "src/fastertransformer/kernels/llama_kernels.h" - #include "src/fastertransformer/layers/FfnLayer.h" #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" #include "src/fastertransformer/utils/llama_utils.h" @@ -86,30 +83,29 @@ void LLaMAContextDecoder::freeBuffer() template bool LLaMAContextDecoder::isValidLayerParallelId(uint l) { - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) - && (l < local_num_layer * (pipeline_para_.rank_ + 1)); + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_)); + return l < num_layer_ && (l >= local_num_layer * rank_) && (l < local_num_layer * (rank_ + 1)); } template bool LLaMAContextDecoder::isFirstLayerParallelId(uint l) { - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_)); + return l < num_layer_ && (l == local_num_layer * rank_); } template bool LLaMAContextDecoder::isLastLayerParallelId(uint l) { - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); - return l < num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_)); + return l < num_layer_ && (l == local_num_layer * (rank_ + 1) - 1); } template int LLaMAContextDecoder::getFirstLayerParallelId() { - int local_num_layer = (int)(ceil(num_layer_ * 1.0f / 
pipeline_para_.world_size_)); - return local_num_layer * pipeline_para_.rank_; + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_)); + return local_num_layer * rank_; } template @@ -119,7 +115,8 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, size_t num_layer, size_t rotary_embedding_dim, float layernorm_eps, - NcclParam pipeline_para, + size_t rank, + size_t world_size, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, @@ -134,7 +131,8 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, rotary_embedding_dim_(rotary_embedding_dim), layernorm_eps_(layernorm_eps), hidden_units_(head_num * size_per_head), - pipeline_para_(pipeline_para), + rank_(rank), + world_size_(world_size), is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type) { @@ -151,7 +149,8 @@ LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decode rotary_embedding_dim_(decoder.rotary_embedding_dim_), layernorm_eps_(decoder.layernorm_eps_), hidden_units_(decoder.hidden_units_), - pipeline_para_(decoder.pipeline_para_), + rank_(decoder.rank_), + world_size_(decoder.world_size_), is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_) { diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index 94eb82a37..94d960dab 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -42,14 +42,10 @@ class LLaMAContextDecoder: public BaseLayer { size_t num_layer_; size_t rotary_embedding_dim_; float layernorm_eps_; - - // calculated data size_t hidden_units_; - - NcclParam pipeline_para_; - + size_t rank_; + size_t world_size_; AttentionType attention_type_; - bool is_qk_buf_float_; BaseAttentionLayer* self_attention_layer_; @@ -78,7 +74,8 @@ class LLaMAContextDecoder: public BaseLayer { size_t num_layer, size_t rotary_embedding_dim, float layernorm_eps, - NcclParam pipeline_para, + size_t rank, + size_t world_size, cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index 580b05d5c..ef9cca14f 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -27,8 +27,8 @@ LLaMA::LLaMA(const int64_t num_heads, const int64_t rotary_embedding_dim, const int64_t random_seed, const int64_t max_seq_len, - const int64_t tensor_para_size, - const int64_t pipeline_para_size, + const int64_t rank, + const int64_t world_size, const vector weights): vocab_size_(vocab_size), st_(weights[0].scalar_type()) { @@ -46,8 +46,8 @@ LLaMA::LLaMA(const int64_t num_heads, (size_t)rotary_embedding_dim, (size_t)random_seed, (size_t)max_seq_len, - tensor_para_size, - pipeline_para_size, + (size_t)rank, + (size_t)world_size, weights); break; case at::ScalarType::Half: @@ -59,8 +59,8 @@ LLaMA::LLaMA(const int64_t num_heads, (size_t)rotary_embedding_dim, (size_t)random_seed, (size_t)max_seq_len, - tensor_para_size, - pipeline_para_size, + (size_t)rank, + (size_t)world_size, weights); break; default: diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index 029780e7f..db8b8aeec 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -52,8 +52,8 @@ class FTLLaMA: public IFLLaMA { const size_t rotary_embedding_dim, const size_t random_seed, 
const size_t max_seq_len, - const int64_t tensor_para_size, - const int64_t pipeline_para_size, + const int64_t rank, + const int64_t world_size, const vector weights): num_heads_(num_heads), size_per_head_(size_per_head), @@ -63,8 +63,8 @@ class FTLLaMA: public IFLLaMA { rotary_embedding_dim_(rotary_embedding_dim), random_seed_(random_seed), max_seq_len_(max_seq_len), - tensor_para_size_(tensor_para_size), - pipeline_para_size_(pipeline_para_size), + rank_(rank), + world_size_(world_size), weights_(weights) { ft::Logger::getLogger().setLevel(ft::Logger::WARNING); @@ -73,8 +73,6 @@ class FTLLaMA: public IFLLaMA { cublas_algo_map_ = new ft::cublasAlgoMap(GEMM_CONFIG, ""); cublas_wrapper_mutex_ = new std::mutex(); - ftNcclInitialize(tensor_para_, pipeline_para_, tensor_para_size, pipeline_para_size); - llama_weights_.resizeLayer(num_layers_); for (int i = 0; i < (int)num_layers_; i++) { llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta = @@ -113,7 +111,6 @@ class FTLLaMA: public IFLLaMA { llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * num_layers_ + 3]); ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); - // ft::check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); ft::check_cuda_error(cudaStreamCreate(&stream_)); for (int i = 0; i < num_events_; ++i) { @@ -150,8 +147,8 @@ class FTLLaMA: public IFLLaMA { rotary_embedding_dim_, random_seed_, max_seq_len_, - tensor_para_, - pipeline_para_, + rank_, + world_size_, stream_, cublas_wrapper_, allocator_, @@ -172,8 +169,6 @@ class FTLLaMA: public IFLLaMA { delete cublas_wrapper_; delete allocator_; - ft::ftNcclParamDestroy(tensor_para_); - ft::ftNcclParamDestroy(pipeline_para_); cublasLtDestroy(cublasltHandle_); delete cublas_algo_map_; delete cublas_wrapper_mutex_; @@ -256,8 +251,8 @@ class FTLLaMA: public IFLLaMA { const size_t rotary_embedding_dim_; const size_t random_seed_; const size_t max_seq_len_; - int64_t tensor_para_size_; - int64_t pipeline_para_size_; + const size_t rank_; + const size_t world_size_; static constexpr int num_events_ = 5; int ev_no_ = 0; @@ -271,9 +266,6 @@ class FTLLaMA: public IFLLaMA { struct cudaDeviceProp prop_; ft::LLaMAWeight llama_weights_; - ft::NcclParam tensor_para_; - ft::NcclParam pipeline_para_; - ft::cublasMMWrapper* cublas_wrapper_; ft::IAllocator* allocator_; ft::LLaMA* llama_ = nullptr; @@ -289,8 +281,8 @@ class LLaMA: public th::jit::CustomClassHolder { const int64_t rotary_embedding_dim, const int64_t random_seed, const int64_t max_seq_len, - const int64_t tensor_para_size, - const int64_t pipeline_para_size, + const int64_t rank, + const int64_t world_size, const vector weights); ~LLaMA(); From ce8c72a3040c2ec075cb34d98357c640ab1ef00e Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 1 Oct 2023 07:04:48 +0000 Subject: [PATCH 53/55] add mpi_cxx --- CMakeLists.txt | 2 +- src/fastertransformer/utils/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 870e67f0a..0d879611a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -418,7 +418,7 @@ add_library(transformer-shared SHARED if (BUILD_MULTI_GPU) target_link_libraries(transformer-shared PUBLIC - -lmpi + -lmpi -lmpi_cxx ${NCCL_LIBRARIES} ) endif() diff --git a/src/fastertransformer/utils/CMakeLists.txt b/src/fastertransformer/utils/CMakeLists.txt index 9796ad076..22f735c27 100644 --- a/src/fastertransformer/utils/CMakeLists.txt +++ b/src/fastertransformer/utils/CMakeLists.txt @@ -57,7 +57,7 @@ 
add_library(mpi_utils STATIC mpi_utils.cc) set_property(TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) if (BUILD_MULTI_GPU) - target_link_libraries(mpi_utils PUBLIC -lmpi logger) + target_link_libraries(mpi_utils PUBLIC -lmpi -lmpi_cxx logger) endif() add_library(nccl_utils STATIC nccl_utils.cc) From 48b35f76924cf89a7ec61d26462f0444f8a2f1f2 Mon Sep 17 00:00:00 2001 From: dypshong Date: Sun, 1 Oct 2023 10:25:54 +0000 Subject: [PATCH 54/55] ref --- .../LLaMAContextAttentionLayer.cc | 159 ++++++------------ .../LLaMAContextAttentionLayer.h | 8 +- src/fastertransformer/models/llama/LLaMA.cc | 11 +- src/fastertransformer/models/llama/LLaMA.h | 2 - .../models/llama/LLaMAContextDecoder.cc | 6 +- .../models/llama/LLaMAContextDecoder.h | 2 - src/fastertransformer/th_op/llama/LLaMA.cc | 4 +- src/fastertransformer/th_op/llama/LLaMA.h | 8 - 8 files changed, 60 insertions(+), 140 deletions(-) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index daf3d9178..209fed1ca 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -64,7 +64,7 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten "LLaMA Context FUSED_PADDED_MHA is not supported !"); PUSH_RANGE("attention buffer alloc"); - allocateBuffer(batch_size, seq_len, attn_len, max_seq_len, attention_type != AttentionType::FUSED_MHA); + allocateBuffer(batch_size, seq_len, attn_len); POP_RANGE; sync_check_cuda_error(); @@ -138,86 +138,51 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten const int attention_seq_len_1 = seq_len; // q length const int attention_seq_len_2 = attn_len; // kv length const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); + FT_CHECK(gemm_data_type != CUDA_R_32F); // // softmax(Q*K^T) // - if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { - PUSH_RANGE("Q*K batch gemm"); - - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, - CUBLAS_OP_N, - attention_seq_len_2, // n - attention_seq_len_1, // m - size_per_head_, // k - 1.0f, - k_buf_2_, - gemm_data_type, - size_per_head_, // k - attention_seq_len_2 * size_per_head_, // n * k - q_buf_2_, - gemm_data_type, - size_per_head_, // k - attention_seq_len_1 * size_per_head_, // m * k - 0.0f, - qk_buf_float_, - CUDA_R_32F, - attention_seq_len_2, // n - attention_seq_len_2 * attention_seq_len_1, - batch_size * head_num_, // global batch size - CUDA_R_32F); - sync_check_cuda_error(); - POP_RANGE; - - PUSH_RANGE("softmax"); - MaskedSoftmaxParam param; - param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) - param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = batch_size; - param.q_length = attention_seq_len_1; - param.k_length = attention_seq_len_2; - param.num_heads = head_num_; - param.qk_scale = qk_scale; - param.linear_bias_slopes = nullptr; - invokeMaskedSoftmax(param, stream_); - sync_check_cuda_error(); - POP_RANGE; - } - else { - PUSH_RANGE("Q*K batch gemm"); - cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, - CUBLAS_OP_N, - attention_seq_len_2, - attention_seq_len_1, - size_per_head_, - k_buf_2_, - size_per_head_, - attention_seq_len_2 * size_per_head_, - q_buf_2_, - size_per_head_, - 
attention_seq_len_1 * size_per_head_, - qk_buf_, - attention_seq_len_2, - attention_seq_len_2 * attention_seq_len_1, - batch_size * head_num_); - - POP_RANGE; - PUSH_RANGE("softmax"); - MaskedSoftmaxParam param; - param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.qk = qk_buf_; // (batch_size, head_num, q_length, k_length) - param.attention_mask = attention_mask; // (batch_size, q_length, k_length) - param.batch_size = batch_size; - param.q_length = attention_seq_len_1; - param.k_length = attention_seq_len_2; - param.num_heads = head_num_; - param.qk_scale = qk_scale; - param.linear_bias_slopes = nullptr; - invokeMaskedSoftmax(param, stream_); - sync_check_cuda_error(); - POP_RANGE; - } + PUSH_RANGE("Q*K batch gemm"); + + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, + CUBLAS_OP_N, + attention_seq_len_2, // n + attention_seq_len_1, // m + size_per_head_, // k + 1.0f, + k_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_2 * size_per_head_, // n * k + q_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_1 * size_per_head_, // m * k + 0.0f, + qk_buf_float_, + CUDA_R_32F, + attention_seq_len_2, // n + attention_seq_len_2 * attention_seq_len_1, + batch_size * head_num_, // global batch size + CUDA_R_32F); + sync_check_cuda_error(); + POP_RANGE; + + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = nullptr; + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; PUSH_RANGE("QK*V batch gemm"); cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, @@ -301,14 +266,12 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_ cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float): + bool is_free_buffer_after_forward): BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), head_num_(head_num), size_per_head_(size_per_head), hidden_units_(head_num * size_per_head), - rotary_embedding_dim_(rotary_embedding_dim), - is_qk_buf_float_(is_qk_buf_float) + rotary_embedding_dim_(rotary_embedding_dim) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); } @@ -322,8 +285,7 @@ LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(LLaMAContextAttentionL head_num_(attention_layer.head_num_), size_per_head_(attention_layer.size_per_head_), hidden_units_(attention_layer.hidden_units_), - rotary_embedding_dim_(attention_layer.rotary_embedding_dim_), - is_qk_buf_float_(attention_layer.is_qk_buf_float_) + rotary_embedding_dim_(attention_layer.rotary_embedding_dim_) { } @@ -341,36 +303,22 @@ void LLaMAContextAttentionLayer::allocateBuffer() } template -void LLaMAContextAttentionLayer::allocateBuffer( - size_t batch_size, size_t seq_len, size_t attn_len, size_t max_seq_len, bool allocate_qk_buf) +void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); - q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, 
sizeof(T) * batch_size * (seq_len + 2 * attn_len) * hidden_units_, false); + q_buf_2_ = + (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * (seq_len + 2 * attn_len) * hidden_units_, false); k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; v_buf_2_ = k_buf_2_ + batch_size * attn_len * hidden_units_; // save memory usage when using fmha - if (allocate_qk_buf) { - qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * attn_len, false); - } - else { - allocator_->free((void**)(&qk_buf_)); - qk_buf_ = nullptr; - } + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * attn_len, false); qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, false); qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, false); - if (is_qk_buf_float_ == true) { - if (allocate_qk_buf) { - qk_buf_float_ = (float*)allocator_->reMalloc( - qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * attn_len, false); - } - else { - allocator_->free((void**)(&qk_buf_float_)); - qk_buf_float_ = nullptr; - } - } + qk_buf_float_ = + (float*)allocator_->reMalloc(qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * attn_len, false); is_allocate_buffer_ = true; } @@ -387,10 +335,7 @@ void LLaMAContextAttentionLayer::freeBuffer() allocator_->free((void**)(&qk_buf_)); allocator_->free((void**)(&qkv_buf_2_)); allocator_->free((void**)(&qkv_buf_3_)); - - if (is_qk_buf_float_ == true) { - allocator_->free((void**)(&qk_buf_float_)); - } + allocator_->free((void**)(&qk_buf_float_)); is_allocate_buffer_ = false; } diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h index 8d24689a8..504cc8aba 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -34,17 +34,14 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { const size_t rotary_embedding_dim_; // fmha runner - int sm_ = getSMVersion(); void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, size_t max_seq_len, bool allocate_qk_buf); + void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len); void freeBuffer() override; using BaseAttentionLayer::is_free_buffer_after_forward_; using BaseAttentionLayer::is_allocate_buffer_; using BaseAttentionLayer::cublas_wrapper_; - bool is_qk_buf_float_; - protected: using BaseAttentionLayer::allocator_; using BaseAttentionLayer::stream_; @@ -65,8 +62,7 @@ class LLaMAContextAttentionLayer: public BaseAttentionLayer { cudaStream_t stream, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, - bool is_free_buffer_after_forward, - bool is_qk_buf_float); + bool is_free_buffer_after_forward); LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer); diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 3884a79ac..052a9b2e2 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -39,7 +39,6 @@ void LLaMA::initialize() cublas_wrapper_, allocator_, is_free_buffer_after_forward_, - is_context_qk_buf_float_, attention_type_); } @@ -188,7 +187,6 @@ void LLaMA::forward(std::unordered_map* output_ten // output_tensors: // 
hidden_vector [num_tokens, hidden_size] - // log_probs [num_tokens, vocab_size] // cum_probs [beam_width, batch_size] FT_CHECK_WITH_INFO(input_tensors->size() == 7, "input_tensors->size() == 7"); @@ -211,7 +209,6 @@ void LLaMA::forward(std::unordered_map* output_ten const int attn_len = input_tensors->at("attn_len").getVal(); const int is_context = input_tensors->at("is_context").getVal(); T* hidden_vector = output_tensors->at("hidden_vector").getPtr(); - float* log_probs = output_tensors->at("log_probs").getPtr(); float* cum_probs = output_tensors->at("cum_probs").getPtr(); FT_CHECK_WITH_INFO(seq_len <= attn_len, "seq_len must be larger than or equal to attn_len"); @@ -311,11 +308,11 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); cublas_wrapper_->setFP16GemmConfig(); - invokeLLaMALogSoftmax(log_probs, logits_buf_, batch_size, vocab_size_, stream_); + invokeLLaMALogSoftmax(log_likelihood_buf_, logits_buf_, batch_size, vocab_size_, stream_); sync_check_cuda_error(); invokeLLaMAExtractTargets( - cum_probs, log_probs, target_ids, cu_seqlens_, beam_width, batch_size, vocab_size_, num_tokens, stream_); + cum_probs, log_likelihood_buf_, target_ids, cu_seqlens_, beam_width, batch_size, vocab_size_, num_tokens, stream_); sync_check_cuda_error(); } else { @@ -345,11 +342,11 @@ void LLaMA::forward(std::unordered_map* output_ten sync_check_cuda_error(); cublas_wrapper_->setFP16GemmConfig(); - invokeLLaMALogSoftmax(log_probs, logits_buf_, num_tokens, vocab_size_, stream_); + invokeLLaMALogSoftmax(log_likelihood_buf_, logits_buf_, num_tokens, vocab_size_, stream_); sync_check_cuda_error(); invokeLLaMAGatherTokens( - cum_probs, log_probs, input_lengths, target_ids, cu_seqlens_, batch_size, vocab_size_, num_tokens, stream_); + cum_probs, log_likelihood_buf_, input_lengths, target_ids, cu_seqlens_, batch_size, vocab_size_, num_tokens, stream_); sync_check_cuda_error(); } } diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h index 7be6120ab..117f87341 100644 --- a/src/fastertransformer/models/llama/LLaMA.h +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -42,8 +42,6 @@ class LLaMA: public BaseLayer { size_t world_size_; static constexpr float layernorm_eps_ = 1e-6f; AttentionType attention_type_; - const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr - || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); LLaMAContextDecoder* llama_context_decoder_; diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 1d5901c61..036921135 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -32,8 +32,7 @@ void LLaMAContextDecoder::initialize() stream_, cublas_wrapper_, allocator_, - is_free_buffer_after_forward_, - is_qk_buf_float_); + is_free_buffer_after_forward_); ffn_layer_ = new SiluFfnLayer(0, // max_batch_size 0, // max_seq_len @@ -121,7 +120,6 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, AttentionType attention_type): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), head_num_(head_num), @@ -133,7 +131,6 @@ LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, hidden_units_(head_num * size_per_head), rank_(rank), world_size_(world_size), - 
is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type) { initialize(); @@ -151,7 +148,6 @@ LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decode hidden_units_(decoder.hidden_units_), rank_(decoder.rank_), world_size_(decoder.world_size_), - is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_) { initialize(); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h index 94d960dab..3e2aeb0c0 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.h +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -46,7 +46,6 @@ class LLaMAContextDecoder: public BaseLayer { size_t rank_; size_t world_size_; AttentionType attention_type_; - bool is_qk_buf_float_; BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; @@ -80,7 +79,6 @@ class LLaMAContextDecoder: public BaseLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, - bool is_qk_buf_float, AttentionType attention_type = AttentionType::FUSED_MHA); LLaMAContextDecoder(LLaMAContextDecoder const& decoder); diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc index ef9cca14f..760ead92e 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.cc +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -74,7 +74,6 @@ LLaMA::~LLaMA() } std::vector LLaMA::forward(th::Tensor& hidden_vector, - th::Tensor& log_probs, th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, @@ -92,7 +91,6 @@ std::vector LLaMA::forward(th::Tensor& hidden_vector, TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); ftllama->forward(hidden_vector, - log_probs, cum_probs, input_ids, input_lengths, @@ -101,7 +99,7 @@ std::vector LLaMA::forward(th::Tensor& hidden_vector, seq_len, attn_len, is_context); - return std::vector{hidden_vector, log_probs, cum_probs}; + return std::vector{hidden_vector, cum_probs}; } } // namespace torch_ext diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h index db8b8aeec..425f260df 100755 --- a/src/fastertransformer/th_op/llama/LLaMA.h +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -30,7 +30,6 @@ class IFLLaMA { public: virtual ~IFLLaMA() {} virtual void forward(th::Tensor& hidden_vector, - th::Tensor& log_probs, th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, @@ -175,7 +174,6 @@ class FTLLaMA: public IFLLaMA { } virtual void forward(th::Tensor& hidden_vector, - th::Tensor& log_probs, th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, @@ -212,11 +210,6 @@ class FTLLaMA: public IFLLaMA { (std::is_same::value) ? 
ft::TYPE_FP16 : ft::TYPE_FP32, std::vector{num_tokens, num_heads_ * size_per_head_}, get_ptr(hidden_vector)}}, - {"log_probs", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_FP32, - std::vector{num_tokens, vocab_size_}, - get_ptr(log_probs)}}, {"cum_probs", ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP32, @@ -288,7 +281,6 @@ class LLaMA: public th::jit::CustomClassHolder { ~LLaMA(); std::vector forward(th::Tensor& hidden_vector, - th::Tensor& log_probs, th::Tensor& cum_probs, th::Tensor& input_ids, th::Tensor& input_lengths, From baca61bafecd25a11f8099bf3cbf778caaaac245 Mon Sep 17 00:00:00 2001 From: dypshong Date: Mon, 2 Oct 2023 03:14:12 +0000 Subject: [PATCH 55/55] final --- .../kernels/llama_kernels.cu | 39 +++++++++++++++++-- src/fastertransformer/kernels/llama_kernels.h | 1 + .../LLaMAContextAttentionLayer.cc | 4 +- src/fastertransformer/models/llama/LLaMA.cc | 4 +- .../models/llama/LLaMAContextDecoder.cc | 4 ++ 5 files changed, 47 insertions(+), 5 deletions(-) diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu index d6d119227..4b02602d9 100644 --- a/src/fastertransformer/kernels/llama_kernels.cu +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -41,9 +41,19 @@ template void invokeLLaMAGetLastTokens( float* out, float* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); template void invokeLLaMAGetLastTokens( half* out, half* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); - -__global__ void LLaMA_extract_targets( - float* out, float* in, const int* target_ids, const int* cu_seqlens, int beam_width, int batch_size, int vocab_size, int num_tokens) +#ifdef ENABLE_BF16 +template void invokeLLaMAGetLastTokens( + __nv_bfloat16* out, __nv_bfloat16* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); +#endif + +__global__ void LLaMA_extract_targets(float* out, + float* in, + const int* target_ids, + const int* cu_seqlens, + int beam_width, + int batch_size, + int vocab_size, + int num_tokens) { // in [batch_size, vocab_size] // target_ids [ beam_width, num_tokens ] @@ -211,6 +221,14 @@ template void invokeLLaMAInputIdsEmbeddingLookup(half* out, const int num_tokens, const int hidden_units, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAInputIdsEmbeddingLookup(__nv_bfloat16* out, + const __nv_bfloat16* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); +#endif __global__ void LLaMAgetPaddingOffsetAndCuSeqLensKernel( int* padding_offset, int* cu_seqlens, const int* sequence_length, const int batch_size, const int seq_len) @@ -299,6 +317,15 @@ template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, const int seq_len, const int attn_len, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMABuildDecoderAttentionMask(__nv_bfloat16* attention_mask, + const int* sequence_length, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int attn_len, + cudaStream_t stream); +#endif template __global__ void LLaMACopyKernel(T* dst, T* src, const int count) @@ -326,6 +353,9 @@ void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream) template void invokeLLaMACopyKernel(float* dst, float* src, const int count, cudaStream_t stream); template void invokeLLaMACopyKernel(half* dst, half* src, const int count, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMACopyKernel(__nv_bfloat16* 
dst, __nv_bfloat16* src, const int count, cudaStream_t stream); +#endif template __global__ void LLaMAMemset0Kernel(T* dst, const int count) @@ -352,5 +382,8 @@ void invokeLLaMAMemset0(T* dst, const int count, cudaStream_t stream) template void invokeLLaMAMemset0(float* dst, const int count, cudaStream_t stream); template void invokeLLaMAMemset0(half* dst, const int count, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAMemset0(__nv_bfloat16* dst, const int count, cudaStream_t stream); +#endif } // namespace fastertransformer diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h index f0d356a09..01e3bbf7a 100644 --- a/src/fastertransformer/kernels/llama_kernels.h +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -29,6 +29,7 @@ void invokeLLaMAInputIdsEmbeddingLookup(T* from_tensor, template void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream); + template void invokeLLaMAMemset0(T* dst, const int count, cudaStream_t stream); diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc index 209fed1ca..28c0f6f55 100644 --- a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc @@ -18,6 +18,7 @@ #include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h" #include "src/fastertransformer/kernels/layernorm_kernels.h" #include "src/fastertransformer/kernels/unfused_attention_kernels.h" +#include "src/fastertransformer/kernels/llama_kernels.h" #include "src/fastertransformer/utils/llama_utils.h" #include "src/fastertransformer/utils/nvtx_utils.h" @@ -87,7 +88,8 @@ void LLaMAContextAttentionLayer::forward(TensorMap* output_ten if (padding_offset != nullptr) { // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous - cudaMemsetAsync(q_buf_2_, 0, batch_size * (seq_len + 2 * attn_len) * hidden_units_ * sizeof(T), stream_); + //cudaMemsetAsync(q_buf_2_, 0, batch_size * (seq_len + 2 * attn_len) * hidden_units_ * sizeof(T), stream_); + invokeLLaMAMemset0(q_buf_2_, batch_size * (seq_len + 2 * attn_len) * hidden_units_, stream_); sync_check_cuda_error(); } diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc index 052a9b2e2..1cc8a95c4 100644 --- a/src/fastertransformer/models/llama/LLaMA.cc +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -248,7 +248,9 @@ void LLaMA::forward(std::unordered_map* output_ten {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, context_lengths}}, {"seq_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &seq_len}}, - {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &attn_len}}}; + {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &attn_len}}, + {"is_context", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &is_context}}, + }; if (is_unpadded_mha) { decoder_input_tensors.insert({"padding_offset", Tensor{MEMORY_GPU, TYPE_INT32, {num_tokens}, padding_offset_}}); diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc index 036921135..7406c838f 100644 --- a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc @@ -189,6 +189,8 @@ void LLaMAContextDecoder::forward(std::unordered_map* // 
input_lengths [batch_size] // context_lengths [batch_size] // seq_len [1] int on cpu + // attn_len [1] int on cpu + // is_context [1] int on cpu // padding_offset [batch_size] int on cpu // cu_seqlens [batch_size+1] int on cpu @@ -208,6 +210,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* const int* context_lengths = input_tensors->at("context_lengths").getPtr(); const int seq_len = input_tensors->at("attention_mask").shape[2]; const int attn_len = input_tensors->at("attention_mask").shape[3]; + const int is_context = input_tensors->at("is_context").getVal(); const int* padding_offset = nullptr; const int* cu_seqlens = nullptr; if (is_unpadded_mha) { @@ -267,6 +270,7 @@ void LLaMAContextDecoder::forward(std::unordered_map* attention_mask}}, {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, context_lengths}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, + {"is_context", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &is_context}}, }; if (is_unpadded_mha) {