@@ -210,6 +210,7 @@ def finetune(self, model_name, dataset, new_model_name, batch_size, num_epochs,
         origin_model_path = self._base_models[model_name]["model_id_or_path"]
         tokenizer_path = self._base_models[model_name]["tokenizer_name_or_path"]
         gpt_base_model = self._base_models[model_name].get("gpt_base_model")
+        last_gpt_base_model = False
         finetuned_model_path = os.path.join(self.finetuned_model_path, model_name, new_model_name)
         finetuned_checkpoint_path = os.path.join(self.finetuned_checkpoint_path, model_name, new_model_name) if self.finetuned_checkpoint_path != "" else None
 
@@ -221,7 +222,7 @@ def finetune(self, model_name, dataset, new_model_name, batch_size, num_epochs,
         ray_resources = ray.available_resources()
         if "CPU" not in ray_resources or cpus_per_worker * worker_num + 1 > int(ray.available_resources()["CPU"]):
             raise gr.Error("Resources are not meeting the demand")
-        if worker_num != exist_worker or cpus_per_worker != exist_cpus_per_worker or gpt_base_model:
+        if worker_num != exist_worker or cpus_per_worker != exist_cpus_per_worker or not (gpt_base_model and last_gpt_base_model):
             ray.shutdown()
             new_ray_init_config = {
                 "runtime_env": {
@@ -239,6 +240,9 @@ def finetune(self, model_name, dataset, new_model_name, batch_size, num_epochs,
             }
             if gpt_base_model:
                 new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.26.0"]
+            else:
+                new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.31.0"]
+            last_gpt_base_model = gpt_base_model
         finetune_config["Training"]["num_training_workers"] = int(worker_num)
         finetune_config["Training"]["resources_per_worker"]["CPU"] = int(cpus_per_worker)
 
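Taken together, the hunks above restart the Ray runtime whenever the worker layout changes or the previous run did not also use a GPT-style base model, and they pick the `transformers` pin for the new runtime env from `gpt_base_model`. A minimal sketch of that pin selection, using a hypothetical helper `build_runtime_env` in place of the inline dict in the real code:

    def build_runtime_env(gpt_base_model: bool) -> dict:
        # GPT-style base models keep the older transformers release they were
        # validated against; every other model family gets the 4.31.0 pin.
        pip_pin = "transformers==4.26.0" if gpt_base_model else "transformers==4.31.0"
        return {"runtime_env": {"pip": [pip_pin]}}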
@@ -296,7 +300,7 @@ def finetune(self, model_name, dataset, new_model_name, batch_size, num_epochs,
             "port": "8000",
             "name": new_model_name,
             "route_prefix": "/" + new_model_name,
-            "chat_model": self._base_models[model_name]["chat_model"],
+            "chat_processor": self._base_models[model_name]["chat_processor"],
             "prompt": {
                 "intro": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
                 "human_id": "\n### Instruction",
@@ -342,15 +346,18 @@ def deploy_func(self, model_name: str, replica_num: int, cpus_per_worker: int):
         model_config = self._all_models[model_name]
         print("model path: ", model_config["model_id_or_path"])
 
-        chat_model = getattr(sys.modules[__name__], model_config["chat_model"], None)
-        if chat_model is None:
-            return model_name + " deployment failed. " + model_config["chat_model"] + " does not exist."
-        self.process_tool = chat_model(**model_config["prompt"])
+        chat_processor = getattr(sys.modules[__name__], model_config["chat_processor"], None)
+        if chat_processor is None:
+            return model_name + " deployment failed. " + model_config["chat_processor"] + " does not exist."
+        self.process_tool = chat_processor(**model_config["prompt"])
 
         model_load_config = model_config.get("config", {})
         device_name = "cpu"
-        deployment = PredictDeployment.options(num_replicas=replica_num, ray_actor_options={"num_cpus": cpus_per_worker, "runtime_env": {"pip": ["transformers==4.28.0"]}})\
-            .bind(model_config["model_id_or_path"], model_config["tokenizer_name_or_path"], model_load_config, device_name, amp_enabled, amp_dtype, stop_words=stop_words, cpus_per_worker=cpus_per_worker)
+        deployment = PredictDeployment.options(num_replicas=replica_num, ray_actor_options={"num_cpus": cpus_per_worker, "runtime_env": {"pip": ["transformers==4.31.0"]}})\
+            .bind(model_config["model_id_or_path"], model_config["tokenizer_name_or_path"], model_load_config,
+                  device_name, amp_enabled, amp_dtype,
+                  chat_processor_name=model_config["chat_processor"], prompt=model_config["prompt"],
+                  cpus_per_worker=cpus_per_worker)
         handle = serve.run(deployment, _blocking=True, port=model_config["port"], name=model_config["name"], route_prefix=model_config["route_prefix"])
         return self.ip_port + model_config["route_prefix"]
 
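The `.bind(...)` call now forwards `chat_processor_name` and `prompt` into the deployment, so the replica can build its own prompt-formatting tool. The matching `PredictDeployment` changes are not part of this excerpt; the constructor below is only an assumed sketch, with the keyword names mirrored from the call site:

    import sys
    from ray import serve

    # Hypothetical sketch; the real PredictDeployment is defined elsewhere in the repo.
    @serve.deployment
    class PredictDeployment:
        def __init__(self, model_id_or_path, tokenizer_name_or_path, model_load_config,
                     device_name, amp_enabled, amp_dtype,
                     chat_processor_name=None, prompt=None, cpus_per_worker=1):
            # Resolve the processor class by name and build the prompt tool inside the
            # replica, rather than on the driver as deploy_func did previously.
            chat_processor_cls = getattr(sys.modules[__name__], chat_processor_name, None)
            self.process_tool = chat_processor_cls(**prompt) if chat_processor_cls else None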
@@ -379,8 +386,8 @@ def get_cpu_memory(self, index):
         out = stdout.read().decode('utf-8')
         out_words = out.split(" ")
         cpu_value = 100 - float(out_words[7])
-        total_memory = int(out_words[20].split('+')[0])
-        free_memory = int(out_words[21].split('+')[0])
+        total_memory = float(out_words[20].split('+')[0])
+        free_memory = float(out_words[21].split('+')[0])
         used_memory = 1 - free_memory / total_memory
         return cpu_memory_html.format(str(round(cpu_value, 1)), str(round(used_memory * 100, 1)))
 
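Switching from `int()` to `float()` matters because the memory figures parsed out of the remote `top` output can contain a decimal point; `int()` rejects such strings, while `float()` accepts both forms. A quick illustration with a made-up value:

    int("64176.9")    # raises ValueError: invalid literal for int() with base 10
    float("64176.9")  # 64176.9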
@@ -395,7 +402,7 @@ def kill_node(self, btn_txt, index):
             return "Start", ""
         elif btn_txt == "Start":
             index = int(index)
-            command = "conda activate " + self.conda_env_name + "; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" + self.master_ip_port + r""" --resources='{"special_hardware": 4}'"""
+            command = "conda activate " + self.conda_env_name + "; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" + self.master_ip_port + r""" --resources='{"special_hardware": 2}'"""
             self.ssh_connect[index].exec_command(command)
             self.ray_nodes[index]["Alive"] = "True"
             time.sleep(2)
@@ -428,6 +435,7 @@ def _init_ui(self):
         with gr.Blocks(css=custom_css, title=title) as gr_chat:
             head_content = """
                 <div style="color: #fff; text-align: center;">
+                    <div style="position:absolute; left:15px; top:15px;"><img src="/file=inference/ui_images/logo.png" width="50" height="50"/></div>
                     <p style="color: #fff; font-size: 1.0rem;">LLM on Ray Workflow as a Service Demo</p>
                     <p style="color: #fff; font-size: 0.8rem;">Build your own LLM models with proprietary data, deploy an online inference service in production, all in a few simple clicks.</p>
                 </div>