@@ -65,15 +65,15 @@ def start(self):
65
65
device_map = "cuda" ,
66
66
** self .pretrained_model_init_kwargs
67
67
)
68
-
68
+
69
69
# BGE-VL specific initialization
70
70
if self .is_bge_vl :
71
71
try :
72
72
self .model .set_processor (model_abs_path )
73
73
logger .info (f"BGE-VL processor set successfully for model: { self .model_id } " )
74
74
except Exception as e :
75
75
logger .warning (f"Failed to set BGE-VL processor: { e } " )
76
-
76
+
77
77
logger .info (f"model: { self .model } " )
78
78
# TODO add tokenizer init args from model's definition
79
79
# self.tokenizer = AutoTokenizer.from_pretrained(
@@ -106,20 +106,20 @@ def _process_base64_image(self, image_data: str) -> Image.Image:
106
106
# Handle data URL format
107
107
if image_data .startswith ('data:image' ):
108
108
image_data = image_data .split (',' )[1 ]
109
-
109
+
110
110
# Decode base64
111
111
image_bytes = base64 .b64decode (image_data )
112
112
image = Image .open (io .BytesIO (image_bytes ))
113
-
113
+
114
114
# Convert to RGB if needed
115
115
if image .mode != 'RGB' :
116
116
image = image .convert ('RGB' )
117
-
117
+
118
118
return image
119
119
except Exception as e :
120
120
logger .error (f"Failed to process base64 image: { e } " )
121
121
raise ValueError (f"Invalid image data: { e } " )
122
-
122
+
123
123
def _convert_pil_to_bytesio (self , pil_image : Image .Image ) -> io .BytesIO :
124
124
"""Convert PIL Image to BytesIO object for BGE-VL compatibility"""
125
125
try :
@@ -131,13 +131,13 @@ def _convert_pil_to_bytesio(self, pil_image: Image.Image) -> io.BytesIO:
131
131
except Exception as e :
132
132
logger .error (f"Failed to convert PIL image to BytesIO: { e } " )
133
133
raise ValueError (f"Image conversion failed: { e } " )
134
-
134
+
135
135
def _parse_multimodal_inputs (self , inputs ):
136
136
"""Parse and categorize multimodal inputs for BGE-VL"""
137
137
text_inputs = []
138
138
image_inputs = []
139
139
multimodal_inputs = []
140
-
140
+
141
141
for inp in inputs :
142
142
if isinstance (inp , str ):
143
143
# Simple text input
@@ -162,14 +162,14 @@ def _parse_multimodal_inputs(self, inputs):
162
162
# Convert PIL Image to BytesIO for BGE-VL compatibility
163
163
bytesio_image = self ._convert_pil_to_bytesio (pil_image )
164
164
multimodal_inputs .append ((text , bytesio_image ))
165
-
165
+
166
166
return text_inputs , image_inputs , multimodal_inputs
167
-
167
+
168
168
def _generate_bge_vl_embeddings (self , inputs ):
169
169
"""Generate embeddings using BGE-VL model"""
170
170
text_inputs , image_inputs , multimodal_inputs = self ._parse_multimodal_inputs (inputs )
171
171
all_embeddings = []
172
-
172
+
173
173
# Process text-only inputs
174
174
if text_inputs :
175
175
try :
@@ -182,7 +182,7 @@ def _generate_bge_vl_embeddings(self, inputs):
182
182
except Exception as e :
183
183
logger .error (f"Failed to encode text inputs: { e } " )
184
184
raise ValueError (f"BGE-VL text encoding failed: { e } " )
185
-
185
+
186
186
# Process image-only inputs
187
187
if image_inputs :
188
188
try :
@@ -195,7 +195,7 @@ def _generate_bge_vl_embeddings(self, inputs):
195
195
except Exception as e :
196
196
logger .error (f"Failed to encode image inputs: { e } " )
197
197
raise ValueError (f"BGE-VL image encoding failed: { e } " )
198
-
198
+
199
199
# Process multimodal inputs (text + image)
200
200
if multimodal_inputs :
201
201
for text , bytesio_image in multimodal_inputs :
@@ -209,7 +209,7 @@ def _generate_bge_vl_embeddings(self, inputs):
209
209
except Exception as e :
210
210
logger .error (f"Failed to encode multimodal input: { e } " )
211
211
raise ValueError (f"BGE-VL multimodal encoding failed: { e } " )
212
-
212
+
213
213
return all_embeddings
214
214
215
215
def invoke (self , request :dict ):
@@ -219,7 +219,7 @@ def invoke(self, request:dict):
219
219
220
220
logger .info (f'request: { request } ' )
221
221
t0 = time .time ()
222
-
222
+
223
223
if self .is_bge_vl :
224
224
# Use BGE-VL multimodal processing
225
225
embeddings_list = self ._generate_bge_vl_embeddings (inputs )
@@ -229,10 +229,10 @@ def invoke(self, request:dict):
229
229
truncate_dim = request .get ('truncate_dim' , None )
230
230
embeddings = self .model .encode (inputs , task = task , truncate_dim = truncate_dim )
231
231
embeddings_list = embeddings .tolist ()
232
-
232
+
233
233
logger .info (f'embeddings generated, count: { len (embeddings_list )} , elapsed time: { time .time ()- t0 } ' )
234
234
return self .format_openai_response (embeddings_list )
235
-
235
+
236
236
async def ainvoke(self, request: dict):
    """Async entry point; delegates to the synchronous invoke().

    NOTE(review): this awaits nothing — invoke() runs directly on the
    event-loop thread, so the embedding computation will block the loop
    for its full duration. Confirm callers expect that (asyncio.to_thread
    would offload it if not).
    """
    response = self.invoke(request)
    return response
0 commit comments