
Commit a3bb30e

Authored by Zhihao Shan (zhihaoshan-google)

An experimental JAX inference framework for prototyping new ideas. (#161)

It has the following features (some of them are limited versions):

Performance:
1. Paged Attention
2. Chunked Prefill and Piggybacking Decode
3. Collective Matmul

Framework:
1. Pythonic model builder
2. JAX manual sharding
3. Interface for different hardware support
4. On-the-fly HF model conversion and deployment

Please refer to the README file for more information. More reports will be added in later commits.

Co-authored-by: Zhihao Shan <zhihaoshan@google.com>

1 parent 9a7f10b commit a3bb30e


53 files changed: +6976 -0 lines changed

experimental/jax/README.md

Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
# An experimental JAX inference framework for prototyping new ideas.

## About

It has the following features (some of them are limited versions):

```
Performance:
1. Paged Attention
2. Chunked Prefill and Piggybacking Decode
3. Collective Matmul

Framework:
1. Pythonic model builder
2. JAX manual sharding
3. Interface for different hardware support
4. On-the-fly HF model conversion and deployment
```
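
As background on the first feature: in paged attention, the KV cache lives in a pool of fixed-size pages, and each request owns a page table mapping its logical token positions to physical pages, so cache memory is allocated page by page instead of per maximum sequence length. Below is a minimal sketch of that lookup in JAX; the names, shapes, and constants are illustrative assumptions, not this repo's implementation.

```
import jax.numpy as jnp

# All constants below are assumed for illustration.
PAGE_SIZE = 16    # tokens stored per page
NUM_PAGES = 1024  # pages in the shared pool
HEAD_DIM = 128    # per-head feature dimension

# One shared pool of key pages: [num_pages, page_size, head_dim].
k_pages = jnp.zeros((NUM_PAGES, PAGE_SIZE, HEAD_DIM), jnp.bfloat16)

def gather_keys(page_table: jnp.ndarray, seq_len: int) -> jnp.ndarray:
  """Gathers one request's keys from its pages.

  `page_table` holds the request's physical page ids in logical order;
  only the first `seq_len` gathered rows are valid.
  """
  pages = k_pages[page_table]         # [pages_per_req, page_size, head_dim]
  keys = pages.reshape(-1, HEAD_DIM)  # contiguous logical view of the cache
  return keys[:seq_len]               # drop the unused tail of the last page
```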

## Quick Start

So far, the experimental code only works for Llama 2 7B on a TPU v5e-8. The whole process takes less than 10 minutes if you have a Cloud TPU v5e-8 ready.

### 1. Create a Cloud TPU v5e-8 on Google Cloud:

```
gcloud alpha compute tpus queued-resources create ${QR_NAME} \
  --node-id ${NODE_NAME} \
  --project ${PROJECT_ID} \
  --zone ${ZONE} \
  --accelerator-type v5litepod-8 \
  --runtime-version v2-alpha-tpuv5-lite
```

For more information, see the [queued resources documentation](https://cloud.google.com/tpu/docs/queued-resources).

### 2. Set up the LLM Server and serve requests:

SSH into your Cloud TPU VM first and run the following commands:

Set up a new Python env.
```
virtualenv jax-inference
source jax-inference/bin/activate
```

Clone the repo and install the dependencies.
```
git clone https://github.com/AI-Hypercomputer/JetStream.git

cd JetStream/experimental/jax

pip install -r requirements.txt
```

Log in to Hugging Face (make sure your account has permission to access `meta-llama/Llama-2-7b-chat-hf`):

```
huggingface-cli login
```

### 3. Offline Benchmarking:

Note: the current setup uses 8-way TP, which is just for experimenting and comparing with the current JetStream + MaxText numbers.

```
export PYTHONPATH=$(pwd)
export JAX_COMPILATION_CACHE_DIR="/tmp/jax_cache"
python inference/entrypoint/mini_offline_benchmarking.py
```

Offline benchmarking result:

This number is around `45%` better than the current MaxText + JetStream number (as of 2024/08/16) in the same setting.

```
Benchmarking result:
Total requests: 1000
Total input tokens: 218743
Total output tokens: 291740
Input token throughput: 2980.654636529649 tokens/sec
Output token throughput: 3975.332621666338 tokens/sec
```

Note: The online numbers should be even better than the current MaxText and JetStream, as the experimental framework runs prefill and decode together in one model forward pass.
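
To make that note concrete, here is a minimal sketch of piggybacking decode onto a chunked prefill step; the token budget and names are assumptions for illustration, not the framework's actual scheduler. Each forward pass consumes a fixed token budget that packs one token from every in-flight decode request together with the next chunk of a prefilling request, so decode requests never stall behind a long prefill.

```
import numpy as np

TOKEN_BUDGET = 256  # tokens per forward pass (assumed)

def build_step_batch(decode_tokens: np.ndarray,
                     prefill_tokens: np.ndarray) -> np.ndarray:
  """Packs decode tokens plus one prefill chunk into a fixed-size batch.

  `decode_tokens`: one next token per in-flight decode request.
  `prefill_tokens`: the not-yet-processed tokens of one prefilling request.
  """
  budget_left = TOKEN_BUDGET - len(decode_tokens)
  chunk = prefill_tokens[:budget_left]  # the "chunked" part of prefill
  batch = np.concatenate([decode_tokens, chunk])
  # Pad to the static budget so compiled program shapes never change.
  return np.pad(batch, (0, TOKEN_BUDGET - len(batch)))
```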

### 4. Online Serving Example:

Start the server:

```
python inference/entrypoint/run_simple_server.py &
```

Send a request:

```
curl --no-buffer -H 'Content-Type: application/json' \
    -d '{ "prompt": "Today is a good day" }' \
    -X POST \
    localhost:8000/generate
```
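
The same request from Python, as a small sketch that assumes the server streams plain-text chunks back (matching curl's `--no-buffer`):

```
import requests

resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Today is a good day"},
    stream=True,  # print tokens as they arrive
)
resp.raise_for_status()
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
  print(chunk, end="", flush=True)
```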
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
"""
Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
experimental/jax/inference/entrypoint/mini_offline_benchmarking.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
"""
Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import time
import pandas
from inference.runtime.request_type import *
from inference.runtime import offline_inference


def load_openorca_dataset_pkl():
  # Read pickle file
  current_dir = os.path.dirname(__file__)
  samples = pandas.read_pickle(
      f"{current_dir}/open_orca_gpt4_tokenized_llama.calibration_1000.pkl"
  )

  prompts = []
  outputs = []
  for _, row in samples.iterrows():
    prompts.append(row["input"])
    outputs.append(row["output"])

  return [(prompt, output) for prompt, output in zip(prompts, outputs)]


def benchmarking():
  dataset = load_openorca_dataset_pkl()

  ds = dataset[:1000]
  ds = [d[0] for d in ds]

  inference_instance = offline_inference.OfflineInference()

  start_time = time.perf_counter()
  res_list: list[Response] = inference_instance(ds)
  end_time = time.perf_counter()
  duration = end_time - start_time

  input_tokens = []
  for res in res_list:
    input_tokens = input_tokens + res.input_tokens

  output_tokens = []
  for res in res_list:
    output_tokens = output_tokens + res.generated_tokens

  num_input_tokens = len(input_tokens)
  num_output_tokens = len(output_tokens)

  print("Benchmarking result: ")
  # Hardcode the number of requests as 1000 based on the test
  # dataset.
  print(" Total requests: 1000")
  print(" Total input tokens:", num_input_tokens)
  print(" Total output tokens:", num_output_tokens)
  print(f" Input token throughput: {num_input_tokens/duration} tokens/sec")
  print(f" Output token throughput: {num_output_tokens/duration} tokens/sec")


if __name__ == "__main__":
  benchmarking()
Binary file not shown.
experimental/jax/inference/entrypoint/run_simple_server.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
"""
Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import uvicorn


if __name__ == "__main__":
  print("start")
  current_dir = os.path.dirname(__file__)
  parent_dir = os.path.dirname(current_dir)

  uvicorn.run(
      app_dir=f"{parent_dir}/server",
      app="simple_server:app",
      host="0.0.0.0",
      port=8000,
  )
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
"""
Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from .attention_ops import *
from .attention.tpu.quantization_utils import *
from .collective_matmul_ops import *
from .linear.tpu.collective_matmul import (
    prepare_rhs_for_all_gather_collective_matmul,
    prepare_rhs_for_collective_matmul_reduce_scatter,
)
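
The collective matmul imports above are the repo's own TPU kernels. As background, the technique splits a sharded matmul into per-device steps so the collective's communication overlaps with compute. Below is a minimal, self-contained sketch of the all-gather variant in plain JAX with `shard_map`; the mesh, shapes, and names are assumptions for illustration, not the repo's implementation. Instead of all-gathering the row-sharded `lhs` and then multiplying, each device multiplies the row block it currently holds by its local `rhs` shard and hands the block to the next device on a ring.

```
import functools
import jax
import jax.numpy as jnp
from jax.experimental.shard_map import shard_map
from jax.sharding import Mesh, PartitionSpec as P

mesh = Mesh(jax.devices(), ("x",))
n = mesh.shape["x"]  # static ring size

@functools.partial(
    shard_map,
    mesh=mesh,
    in_specs=(P("x", None), P(None, "x")),  # lhs row-sharded, rhs col-sharded
    out_specs=P(None, "x"),                 # output col-sharded
    check_rep=False,
)
def ag_collective_matmul(lhs_block, rhs_shard):
  """Computes lhs @ rhs while rotating lhs row blocks around the ring."""
  idx = jax.lax.axis_index("x")
  rows = lhs_block.shape[0]
  out = jnp.zeros((rows * n, rhs_shard.shape[1]), lhs_block.dtype)
  perm = [(j, (j + 1) % n) for j in range(n)]

  def step(i, carry):
    out, blk = carry
    src = (idx - i) % n  # which lhs row block this device holds at step i
    out = jax.lax.dynamic_update_slice_in_dim(
        out, blk @ rhs_shard, src * rows, axis=0
    )
    blk = jax.lax.ppermute(blk, "x", perm)  # pass the block along the ring
    return out, blk

  out, _ = jax.lax.fori_loop(0, n, step, (out, lhs_block))
  return out
```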
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
"""
Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from .chunked_prefill_attention import *
from .paged_attention import *
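
The chunked-prefill attention import above is the repo's kernel; one detail worth noting is the mask shape: a prefill chunk's queries attend to every token already in the KV cache and causally within the chunk itself. A minimal sketch of that mask (an assumed helper, not the repo's code):

```
import jax.numpy as jnp

def chunk_attention_mask(cache_len: int, chunk_len: int) -> jnp.ndarray:
  """Boolean mask for one prefill chunk of length `chunk_len`:
  full visibility over the `cache_len` cached tokens, causal within
  the chunk itself."""
  cached = jnp.ones((chunk_len, cache_len), jnp.bool_)
  local = jnp.tril(jnp.ones((chunk_len, chunk_len), jnp.bool_))
  return jnp.concatenate([cached, local], axis=1)
```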
