|
7 | 7 |
|
8 | 8 | class TestTokenizeFunction(unittest.TestCase): |
9 | 9 | def setUp(self): |
10 | | - self.tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf') |
| 10 | + self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") |
11 | 11 | self.config = { |
12 | | - 'gpt_base_model': True, |
13 | | - 'max_length': 512, |
14 | | - 'trust_remote_code': False, |
15 | | - 'chat_template': "Below is an instruction that describes a task. Write a response that appropriately " |
16 | | - "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception(" |
17 | | - "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message[" |
18 | | - "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles " |
19 | | - "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] " |
20 | | - "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == " |
21 | | - "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " |
22 | | - "End \n'}}", |
| 12 | + "gpt_base_model": True, |
| 13 | + "max_length": 512, |
| 14 | + "trust_remote_code": False, |
| 15 | + "chat_template": "Below is an instruction that describes a task. Write a response that appropriately " |
| 16 | + "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception(" |
| 17 | + "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message[" |
| 18 | + "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles " |
| 19 | + "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] " |
| 20 | + "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == " |
| 21 | + "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " |
| 22 | + "End \n'}}", |
23 | 23 | } |
24 | 24 | self.processer = GeneralProcesser(self.config) |
25 | 25 |
|
26 | 26 | def test_tokenize_function_with_gpt_model(self): |
27 | | - self.tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6b') |
| 27 | + self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b") |
28 | 28 |
|
29 | | - examples = \ |
30 | | - { |
31 | | - "instruction": "Test instruction", |
32 | | - "response": "Test response", |
33 | | - "context": "Test context", |
34 | | - } |
| 29 | + examples = { |
| 30 | + "instruction": "Test instruction", |
| 31 | + "response": "Test response", |
| 32 | + "context": "Test context", |
| 33 | + } |
35 | 34 |
|
36 | 35 | # Verify the format of the result |
37 | | - expected_result = 'Below is an instruction that describes a task. Write a response that '\ |
38 | | - 'appropriately completes the request.\n'\ |
39 | | - '\n'\ |
40 | | - '### Instruction:\n'\ |
41 | | - 'Test instruction\n'\ |
42 | | - '\n'\ |
43 | | - 'Input:\n'\ |
44 | | - 'Test context\n'\ |
45 | | - '\n'\ |
46 | | - '### Response:\n'\ |
47 | | - 'Test response\n'\ |
48 | | - '\n'\ |
49 | | - '### End' |
| 36 | + expected_result = ( |
| 37 | + "Below is an instruction that describes a task. Write a response that " |
| 38 | + "appropriately completes the request.\n" |
| 39 | + "\n" |
| 40 | + "### Instruction:\n" |
| 41 | + "Test instruction\n" |
| 42 | + "\n" |
| 43 | + "Input:\n" |
| 44 | + "Test context\n" |
| 45 | + "\n" |
| 46 | + "### Response:\n" |
| 47 | + "Test response\n" |
| 48 | + "\n" |
| 49 | + "### End" |
| 50 | + ) |
50 | 51 |
|
51 | 52 | result = self.processer.tokenize_function(examples, self.tokenizer) |
52 | | - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) |
| 53 | + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) |
53 | 54 |
|
54 | 55 | def test_tokenize_function_with_custom_chat_template(self): |
55 | | - examples = \ |
56 | | - { |
57 | | - "instruction": "Test instruction", |
58 | | - "response": "Test response", |
59 | | - "context": "Test context", |
60 | | - } |
| 56 | + examples = { |
| 57 | + "instruction": "Test instruction", |
| 58 | + "response": "Test response", |
| 59 | + "context": "Test context", |
| 60 | + } |
61 | 61 |
|
62 | 62 | # Verify the format of the result |
63 | | - expected_result = '<|im_start|>user\n' \ |
64 | | - '###Instruction:\n' \ |
65 | | - 'Test instruction\n' \ |
66 | | - '\n' \ |
67 | | - '###context:\n' \ |
68 | | - 'Test context\n' \ |
69 | | - '\n' \ |
70 | | - '<|im_end|><|im_start|>assistant\n' \ |
71 | | - 'Test response\n' \ |
72 | | - '\n' \ |
73 | | - '<|im_end|>' |
| 63 | + expected_result = ( |
| 64 | + "<|im_start|>user\n" |
| 65 | + "###Instruction:\n" |
| 66 | + "Test instruction\n" |
| 67 | + "\n" |
| 68 | + "###context:\n" |
| 69 | + "Test context\n" |
| 70 | + "\n" |
| 71 | + "<|im_end|><|im_start|>assistant\n" |
| 72 | + "Test response\n" |
| 73 | + "\n" |
| 74 | + "<|im_end|>" |
| 75 | + ) |
74 | 76 | # Set custom chat template |
75 | | - self.config['custom_chat_template'] = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'"\ |
76 | | - "+ message['content'] + '<|im_end|>'}}{% endfor %}" |
| 77 | + self.config["custom_chat_template"] = ( |
| 78 | + "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'" |
| 79 | + "+ message['content'] + '<|im_end|>'}}{% endfor %}" |
| 80 | + ) |
77 | 81 |
|
78 | | - self.config['gpt_base_model'] = False |
| 82 | + self.config["gpt_base_model"] = False |
79 | 83 | result = self.processer.tokenize_function(examples, self.tokenizer) |
80 | | - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) |
| 84 | + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) |
81 | 85 |
|
82 | 86 | def test_tokenize_function_with_chat_template(self): |
83 | | - examples = \ |
84 | | - { |
85 | | - "instruction": "Test instruction", |
86 | | - "response": "Test response", |
87 | | - "context": "Test context", |
88 | | - } |
| 87 | + examples = { |
| 88 | + "instruction": "Test instruction", |
| 89 | + "response": "Test response", |
| 90 | + "context": "Test context", |
| 91 | + } |
89 | 92 |
|
90 | 93 | # Verify the format of the result |
91 | | - expected_result = 'Below is an instruction that describes a task. Write a response that '\ |
92 | | - 'appropriately completes the request\n'\ |
93 | | - '### Instruction: ###Instruction:\n'\ |
94 | | - 'Test instruction\n'\ |
95 | | - '\n'\ |
96 | | - '###context:\n'\ |
97 | | - 'Test context\n'\ |
98 | | - '\n'\ |
99 | | - '### Response: Test response\n'\ |
100 | | - '\n'\ |
101 | | - '### End \n'\ |
102 | | - |
103 | | - self.config['gpt_base_model'] = False |
| 94 | + expected_result = ( |
| 95 | + "Below is an instruction that describes a task. Write a response that " |
| 96 | + "appropriately completes the request\n" |
| 97 | + "### Instruction: ###Instruction:\n" |
| 98 | + "Test instruction\n" |
| 99 | + "\n" |
| 100 | + "###context:\n" |
| 101 | + "Test context\n" |
| 102 | + "\n" |
| 103 | + "### Response: Test response\n" |
| 104 | + "\n" |
| 105 | + "### End \n" |
| 106 | + ) |
| 107 | + self.config["gpt_base_model"] = False |
104 | 108 | result = self.processer.tokenize_function(examples, self.tokenizer) |
105 | | - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) |
| 109 | + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) |
106 | 110 |
|
107 | 111 | def test_tokenize_function_with_default_chat_template(self): |
108 | | - self.tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b-it') |
109 | | - examples = \ |
110 | | - { |
111 | | - "instruction": "Test instruction", |
112 | | - "response": "Test response", |
113 | | - "context": "Test context", |
114 | | - } |
| 112 | + self.tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it") |
| 113 | + examples = { |
| 114 | + "instruction": "Test instruction", |
| 115 | + "response": "Test response", |
| 116 | + "context": "Test context", |
| 117 | + } |
115 | 118 |
|
116 | 119 | chat_example = [ |
117 | 120 | { |
118 | 121 | "role": "user", |
119 | 122 | "content": "###Instruction:\nTest instruction\n\n###context:\nTest context\n\n", |
120 | | - |
121 | 123 | }, |
122 | 124 | { |
123 | 125 | "role": "assistant", |
124 | 126 | "content": "Test response\n\n", |
125 | | - } |
| 127 | + }, |
126 | 128 | ] |
127 | 129 |
|
128 | 130 | # Verify the format of the result |
129 | | - expected_result = self.tokenizer.apply_chat_template(chat_example, |
130 | | - tokenize=False, |
131 | | - max_length=self.config.get("max_length")) |
| 131 | + expected_result = self.tokenizer.apply_chat_template( |
| 132 | + chat_example, tokenize=False, max_length=self.config.get("max_length") |
| 133 | + ) |
132 | 134 |
|
133 | | - self.config['gpt_base_model'] = False |
| 135 | + self.config["gpt_base_model"] = False |
134 | 136 | result = self.processer.tokenize_function(examples, self.tokenizer) |
135 | | - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) |
| 137 | + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) |
136 | 138 |
|
137 | 139 |
|
138 | | -if __name__ == '__main__': |
| 140 | +if __name__ == "__main__": |
139 | 141 | unittest.main() |
0 commit comments