From c99e9f1184cb3eab824e5dfd030007767e0fd492 Mon Sep 17 00:00:00 2001
From: Nicola Dall'Asen
Date: Wed, 13 Mar 2024 12:26:26 +0100
Subject: [PATCH 1/5] detect the Python version automatically; on 3.10 or newer, patch collections to use collections.abc

---
 deepseek_vl/__init__.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/deepseek_vl/__init__.py b/deepseek_vl/__init__.py
index 8cb7640..8f00116 100644
--- a/deepseek_vl/__init__.py
+++ b/deepseek_vl/__init__.py
@@ -16,3 +16,14 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+# Check whether the Python version is 3.10 or newer.
+import sys
+if sys.version_info >= (3, 10):
+    print("Python version is 3.10 or newer, patching the collections module.")
+    # Monkey-patch collections so the aliases removed in 3.10 resolve again.
+    import collections
+    import collections.abc
+    for type_name in collections.abc.__all__:
+        setattr(collections, type_name, getattr(collections.abc, type_name))

From 5db8156747f82d67ea3f70e771619c53ae85d795 Mon Sep 17 00:00:00 2001
From: Nicola Dall'Asen
Date: Wed, 13 Mar 2024 12:30:50 +0100
Subject: [PATCH 2/5] pin transformers to 4.38.1 instead of 4.38.2, as autocast fails on MPS with the newer version

---
 pyproject.toml   | 2 +-
 requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a391008..da88b54 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ readme = "README.md"
 requires-python = ">=3.8"
 dependencies = [
     "torch>=2.0.1",
-    "transformers>=4.38.2",
+    "transformers==4.38.1",
     "timm>=0.9.16",
     "gradio>=4.13.0",
     "accelerate",
diff --git a/requirements.txt b/requirements.txt
index 7a93ed8..7535a74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 torch>=2.0.1
-transformers>=4.38.2
+transformers==4.38.1
 timm>=0.9.16
 gradio>=4.13.0
 accelerate
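
Note: the shim in PATCH 1/5 works around Python 3.10's removal of the ABC aliases (Mapping, Sequence, ...) from the top-level collections module, which some transitive dependencies still import. A quick sanity check of its effect (an illustrative snippet, not part of the series) could look like this:

    import collections
    import collections.abc

    # Re-expose the ABCs on the top-level module, exactly as the patch does.
    for type_name in collections.abc.__all__:
        setattr(collections, type_name, getattr(collections.abc, type_name))

    # On Python 3.10+ this lookup would raise AttributeError without the shim.
    assert collections.Mapping is collections.abc.Mapping
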
+ """ + + if torch.cuda.is_available(): + print("Using CUDA and BFloat16") + device = torch.device("cuda") + dtype = torch.bfloat16 + elif torch.backends.mps.is_available(): + print("Using MPS and FP16") + device = torch.device("mps") + dtype = torch.float16 + else: + print("Using CPU and FP32") + device = torch.device("cpu") + dtype = torch.float32 + + return device, dtype + + def load_pretrained_model(model_path: str): vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path) tokenizer = vl_chat_processor.tokenizer From 40ec6491d4b3c0667e427622fb185878d3e98127 Mon Sep 17 00:00:00 2001 From: Nicola Dall'Asen Date: Wed, 13 Mar 2024 12:32:14 +0100 Subject: [PATCH 4/5] load model on detected device and use correct dtype --- cli_chat.py | 6 +++--- deepseek_vl/utils/io.py | 4 +++- inference.py | 8 +++++--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cli_chat.py b/cli_chat.py index bbea14a..e34a88e 100644 --- a/cli_chat.py +++ b/cli_chat.py @@ -8,7 +8,7 @@ import torch from transformers import TextIteratorStreamer -from deepseek_vl.utils.io import load_pretrained_model +from deepseek_vl.utils.io import load_pretrained_model, get_device_and_dtype def load_image(image_file): @@ -34,13 +34,13 @@ def get_help_message(image_token): @torch.inference_mode() def response(args, conv, pil_images, tokenizer, vl_chat_processor, vl_gpt, generation_config): - + _, dtype = get_device_and_dtype() prompt = conv.get_prompt() prepare_inputs = vl_chat_processor.__call__( prompt=prompt, images=pil_images, force_batchify=True - ).to(vl_gpt.device) + ).to(vl_gpt.device, dtype=dtype) # run image encoder to get the image embeddings inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs) diff --git a/deepseek_vl/utils/io.py b/deepseek_vl/utils/io.py index a160f00..06c1a47 100644 --- a/deepseek_vl/utils/io.py +++ b/deepseek_vl/utils/io.py @@ -52,10 +52,12 @@ def load_pretrained_model(model_path: str): vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path) tokenizer = vl_chat_processor.tokenizer + device, dtype = get_device_and_dtype() + vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained( model_path, trust_remote_code=True ) - vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval() + vl_gpt = vl_gpt.to(device, dtype=dtype).eval() return tokenizer, vl_chat_processor, vl_gpt diff --git a/inference.py b/inference.py index b6fe38c..b22a632 100644 --- a/inference.py +++ b/inference.py @@ -2,7 +2,7 @@ from transformers import AutoModelForCausalLM from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM -from deepseek_vl.utils.io import load_pil_images +from deepseek_vl.utils.io import load_pil_images, get_device_and_dtype # specify the path to the model @@ -11,7 +11,9 @@ tokenizer = vl_chat_processor.tokenizer vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) -vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval() + +device, dtype = get_device_and_dtype() +vl_gpt = vl_gpt.to(dtype).to(device).eval() conversation = [ { @@ -32,7 +34,7 @@ conversations=conversation, images=pil_images, force_batchify=True -).to(vl_gpt.device) +).to(vl_gpt.device, dtype=dtype) # run image encoder to get the image embeddings inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs) From 4eac5b59982e37dfdb8fe9c06e763235a09c64d3 Mon Sep 17 00:00:00 2001 From: Nicola Dall'Asen Date: Wed, 13 Mar 2024 12:33:52 +0100 Subject: [PATCH 5/5] modify example in README to use the new device and dtype function --- 
From 4eac5b59982e37dfdb8fe9c06e763235a09c64d3 Mon Sep 17 00:00:00 2001
From: Nicola Dall'Asen
Date: Wed, 13 Mar 2024 12:33:52 +0100
Subject: [PATCH 5/5] update the README example to use the new device and dtype helper

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 6d24eaf..9bfaeeb 100644
--- a/README.md
+++ b/README.md
@@ -112,7 +112,7 @@ import torch
 from transformers import AutoModelForCausalLM
 
 from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
-from deepseek_vl.utils.io import load_pil_images
+from deepseek_vl.utils.io import load_pil_images, get_device_and_dtype
 
 
 # specify the path to the model
@@ -121,7 +121,9 @@ vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
 tokenizer = vl_chat_processor.tokenizer
 
 vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
-vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
+
+device, dtype = get_device_and_dtype()
+vl_gpt = vl_gpt.to(dtype).to(device).eval()
 
 conversation = [
     {
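
The README excerpt above stops at the conversation setup. The rest of the example is unchanged by this series; a sketch of how it continues on the detected device (reconstructed from the repository's existing README example, with the dtype argument from PATCH 4/5):

    # load images and prepare the batched inputs on the model's device/dtype
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device, dtype=dtype)

    # run the image encoder to get the image embeddings
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # run the language model to get the response
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True,
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    print(answer)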