@@ -132,15 +132,19 @@ def create_data(self, examples):
132132 )
133133 else :
134134 new_messages = [
135+ {
136+ "role" : "system" ,
137+ "content" : INTRO_BLURB + "\n " ,
138+ },
135139 {
136140 "role" : "user" ,
137141 "content" : examples ["instruction" ]
138- + "\n \n "
142+ + "\n "
139143 + INPUT_KEY
140144 + examples ["context" ]
141- + "\n \n " ,
145+ + "\n " ,
142146 },
143- {"role" : "assistant" , "content" : examples ["response" ] + "\n \n " },
147+ {"role" : "assistant" , "content" : examples ["response" ] + "\n " },
144148 ]
145149
146150 return new_messages
@@ -162,7 +166,6 @@ def tokenize_func(self, tokenizer, message):
162166 message ,
163167 tokenize = False ,
164168 )
165- print (new_tokenizer )
166169 return tokenizer (
167170 new_tokenizer , add_special_tokens = False , max_length = self .config .get ("max_length" )
168171 )
@@ -251,21 +254,9 @@ def prepare_dataloader(self, tokenizer, dataset):
251254
252255
253256class SlimOrcaDataPreprocess (ChatDataPreprocess ):
254- chat_template = (
255- "{% for message in messages %}"
256- "{% if message['role'] == 'system' %}"
257- "{{ '### System: ' + message['content'] }}"
258- "{% elif message['role'] == 'user' %}"
259- "{{ '### User: ' + message['content'] }}"
260- "{% elif message['role'] == 'assistant' %}"
261- "{{ '### Assistant: ' + message['content'] }}"
262- "{% endif %}"
263- "{% endfor %}"
264- )
265257
266258 def __init__ (self , config ):
267259 super ().__init__ (config )
268- self .config ["chat_template" ] = self .chat_template
269260 self .default_system = "You are a helpful, respectful and honest assistant."
270261
271262 def create_data (self , data ):
@@ -286,22 +277,26 @@ def create_data(self, data):
286277 examples [conv [j ]["from" ]] = conv [j ]["value" ]
287278 examples [conv [j + 1 ]["from" ]] = conv [j + 1 ]["value" ]
288279
289- new_messages = [
290- {"role" : "system" , "content" : examples ["system" ] + "\n " },
291- {
292- "role" : "user" ,
293- "content" : examples ["human" ] + "\n " ,
294- },
295- {"role" : "assistant" , "content" : examples ["gpt" ] + "\n " },
296- ]
297280 if self .config .get ("gpt_base_model" ):
298281 if examples ["human" ]:
299- return SLIMORCA_PROMPT_DICT [ "prompt_with_input" ] .format (
300- system = examples ["system" ], user = examples ["human " ], gpt = examples ["gpt " ]
282+ return PROMPT_WITH_INPUT_FORMAT .format (
283+ instruction = examples ["system" ], response = examples ["gpt " ], input = examples ["human " ]
301284 )
302285 else :
303- return SLIMORCA_PROMPT_DICT [ "prompt_with_input" ] .format (
304- system = examples ["human " ], gpt = examples ["gpt" ]
286+ return PROMPT_NO_INPUT_FORMAT .format (
287+ instruction = examples ["system " ], response = examples ["gpt" ]
305288 )
306289 else :
290+ new_messages = [
291+ {"role" : "system" , "content" : INTRO_BLURB + "\n " },
292+ {
293+ "role" : "user" ,
294+ "content" : examples ["system" ]
295+ + "\n "
296+ + INPUT_KEY
297+ + examples ["human" ]
298+ + "\n " ,
299+ },
300+ {"role" : "assistant" , "content" : examples ["gpt" ] + "\n " },
301+ ]
307302 return new_messages
0 commit comments