|
1 | 1 | <?php |
2 | 2 |
|
3 | | -function gpt_encode($text) |
4 | | -{ |
5 | | - $bpe_tokens = array(); |
6 | | - if(empty($text)) |
7 | | - { |
8 | | - return $bpe_tokens; |
9 | | - } |
10 | | - $raw_chars = file_get_contents(dirname(__FILE__) . "/characters.json"); |
11 | | - $byte_encoder = json_decode($raw_chars, true); |
12 | | - if(empty($byte_encoder)) |
13 | | - { |
14 | | - error_log('Failed to load characters.json: ' . $raw_chars); |
15 | | - return $bpe_tokens; |
16 | | - } |
17 | | - $rencoder = file_get_contents(dirname(__FILE__) . "/encoder.json"); |
18 | | - $encoder = json_decode($rencoder, true); |
19 | | - if(empty($encoder)) |
20 | | - { |
21 | | - error_log('Failed to load encoder.json: ' . $rencoder); |
22 | | - return $bpe_tokens; |
23 | | - } |
| 3 | +require_once __DIR__.'/vendor/autoload.php'; |
24 | 4 |
|
25 | | - $bpe_file = file_get_contents(dirname(__FILE__) . "/vocab.bpe"); |
26 | | - if(empty($bpe_file)) |
27 | | - { |
28 | | - error_log('Failed to load vocab.bpe'); |
29 | | - return $bpe_tokens; |
30 | | - } |
31 | | - |
32 | | - preg_match_all("#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u", $text, $matches); |
33 | | - if(!isset($matches[0]) || count($matches[0]) == 0) |
34 | | - { |
35 | | - error_log('Failed to match string: ' . $text); |
36 | | - return $bpe_tokens; |
37 | | - } |
38 | | - $lines = preg_split('/\r\n|\r|\n/', $bpe_file); |
39 | | - $bpe_merges = array(); |
40 | | - $bpe_merges_temp = array_slice($lines, 1, count($lines), true); |
41 | | - foreach($bpe_merges_temp as $bmt) |
42 | | - { |
43 | | - $split_bmt = preg_split('#(\s+)#', $bmt); |
44 | | - $split_bmt = array_filter($split_bmt, 'gpt_my_filter'); |
45 | | - if(count($split_bmt) > 0) |
46 | | - { |
47 | | - $bpe_merges[] = $split_bmt; |
48 | | - } |
49 | | - } |
50 | | - $bpe_ranks = gpt_dictZip($bpe_merges, range(0, count($bpe_merges) - 1)); |
51 | | - |
52 | | - $cache = array(); |
53 | | - foreach($matches[0] as $token) |
54 | | - { |
55 | | - $new_tokens = array(); |
56 | | - $chars = array(); |
57 | | - $token = utf8_encode($token); |
58 | | - if(function_exists('mb_strlen')) |
59 | | - { |
60 | | - $len = mb_strlen($token, 'UTF-8'); |
61 | | - for ($i = 0; $i < $len; $i++) |
62 | | - { |
63 | | - $chars[] = mb_substr($token, $i, 1, 'UTF-8'); |
64 | | - } |
65 | | - } |
66 | | - else |
67 | | - { |
68 | | - $chars = str_split($token); |
69 | | - } |
70 | | - $result_word = ''; |
71 | | - foreach($chars as $char) |
72 | | - { |
73 | | - if(isset($byte_encoder[gpt_unichr($char)])) |
74 | | - { |
75 | | - $result_word .= $byte_encoder[gpt_unichr($char)]; |
76 | | - } |
77 | | - } |
78 | | - $new_tokens_bpe = gpt_bpe($result_word, $bpe_ranks, $cache); |
79 | | - $new_tokens_bpe = explode(' ', $new_tokens_bpe); |
80 | | - foreach($new_tokens_bpe as $x) |
81 | | - { |
82 | | - if(isset($encoder[$x])) |
83 | | - { |
84 | | - $new_tokens[$x] = $encoder[$x]; |
85 | | - } |
86 | | - else |
87 | | - { |
88 | | - $new_tokens[$x] = $x; |
89 | | - } |
90 | | - } |
91 | | - foreach($new_tokens as $ninx => $nval) |
92 | | - { |
93 | | - if(isset($bpe_tokens[$ninx])) |
94 | | - { |
95 | | - $bpe_tokens[] = $nval; |
96 | | - } |
97 | | - else |
98 | | - { |
99 | | - $bpe_tokens[$ninx] = $nval; |
100 | | - } |
101 | | - } |
102 | | - } |
103 | | - return $bpe_tokens; |
104 | | -} |
105 | | - |
106 | | -function gpt_my_filter($var) |
107 | | -{ |
108 | | - return ($var !== NULL && $var !== FALSE && $var !== ''); |
109 | | -} |
110 | | - |
111 | | -function gpt_unichr($c) |
112 | | -{ |
113 | | - if (ord($c[0]) >=0 && ord($c[0]) <= 127) |
114 | | - { |
115 | | - return ord($c[0]); |
116 | | - } |
117 | | - if (ord($c[0]) >= 192 && ord($c[0]) <= 223) |
118 | | - { |
119 | | - return (ord($c[0])-192)*64 + (ord($c[1])-128); |
120 | | - } |
121 | | - if (ord($c[0]) >= 224 && ord($c[0]) <= 239) |
122 | | - { |
123 | | - return (ord($c[0])-224)*4096 + (ord($c[1])-128)*64 + (ord($c[2])-128); |
124 | | - } |
125 | | - if (ord($c[0]) >= 240 && ord($c[0]) <= 247) |
126 | | - { |
127 | | - return (ord($c[0])-240)*262144 + (ord($c[1])-128)*4096 + (ord($c[2])-128)*64 + (ord($c[3])-128); |
128 | | - } |
129 | | - if (ord($c[0]) >= 248 && ord($c[0]) <= 251) |
130 | | - { |
131 | | - return (ord($c[0])-248)*16777216 + (ord($c[1])-128)*262144 + (ord($c[2])-128)*4096 + (ord($c[3])-128)*64 + (ord($c[4])-128); |
132 | | - } |
133 | | - if (ord($c[0]) >= 252 && ord($c[0]) <= 253) |
134 | | - { |
135 | | - return (ord($c[0])-252)*1073741824 + (ord($c[1])-128)*16777216 + (ord($c[2])-128)*262144 + (ord($c[3])-128)*4096 + (ord($c[4])-128)*64 + (ord($c[5])-128); |
136 | | - } |
137 | | - if (ord($c[0]) >= 254 && ord($c[0]) <= 255) |
138 | | - { |
139 | | - return 0; |
140 | | - } |
141 | | - return 0; |
142 | | -} |
143 | | -function gpt_dictZip($x, $y) |
144 | | -{ |
145 | | - $result = array(); |
146 | | - $cnt = 0; |
147 | | - foreach($x as $i) |
148 | | - { |
149 | | - if(isset($i[1]) && isset($i[0])) |
150 | | - { |
151 | | - $result[$i[0] . ',' . $i[1]] = $cnt; |
152 | | - $cnt++; |
153 | | - } |
154 | | - } |
155 | | - return $result; |
156 | | -} |
157 | | -function gpt_get_pairs($word) |
158 | | -{ |
159 | | - $pairs = array(); |
160 | | - $prev_char = $word[0]; |
161 | | - for ($i = 1; $i < count($word); $i++) |
162 | | - { |
163 | | - $char = $word[$i]; |
164 | | - $pairs[] = array($prev_char, $char); |
165 | | - $prev_char = $char; |
166 | | - } |
167 | | - return $pairs; |
168 | | -} |
169 | | -function gpt_split($str, $len = 1) |
170 | | -{ |
171 | | - $arr = []; |
172 | | - if(function_exists('mb_strlen')) |
173 | | - { |
174 | | - $length = mb_strlen($str, 'UTF-8'); |
175 | | - } |
176 | | - else |
177 | | - { |
178 | | - $length = strlen($str); |
179 | | - } |
180 | | - |
181 | | - for ($i = 0; $i < $length; $i += $len) |
182 | | - { |
183 | | - if(function_exists('mb_substr')) |
184 | | - { |
185 | | - $arr[] = mb_substr($str, $i, $len, 'UTF-8'); |
186 | | - } |
187 | | - else |
188 | | - { |
189 | | - $arr[] = substr($str, $i, $len); |
190 | | - } |
191 | | - } |
192 | | - return $arr; |
193 | | - |
194 | | -} |
195 | | -function gpt_bpe($token, $bpe_ranks, &$cache) |
196 | | -{ |
197 | | - if(array_key_exists($token, $cache)) |
198 | | - { |
199 | | - return $cache[$token]; |
200 | | - } |
201 | | - $word = gpt_split($token); |
202 | | - $init_len = count($word); |
203 | | - $pairs = gpt_get_pairs($word); |
204 | | - if(!$pairs) |
205 | | - { |
206 | | - return $token; |
207 | | - } |
208 | | - while (true) |
209 | | - { |
210 | | - $minPairs = array(); |
211 | | - foreach($pairs as $pair) |
212 | | - { |
213 | | - if(array_key_exists($pair[0] . ','. $pair[1], $bpe_ranks)) |
214 | | - { |
215 | | - $rank = $bpe_ranks[$pair[0] . ','. $pair[1]]; |
216 | | - $minPairs[$rank] = $pair; |
217 | | - } |
218 | | - else |
219 | | - { |
220 | | - $minPairs[10e10] = $pair; |
221 | | - } |
222 | | - } |
223 | | - ksort($minPairs); |
224 | | - $min_key = array_key_first($minPairs); |
225 | | - foreach($minPairs as $mpi => $mp) |
226 | | - { |
227 | | - if($mpi < $min_key) |
228 | | - { |
229 | | - $min_key = $mpi; |
230 | | - } |
231 | | - } |
232 | | - $bigram = $minPairs[$min_key]; |
233 | | - if(!array_key_exists($bigram[0] . ',' . $bigram[1], $bpe_ranks)) |
234 | | - { |
235 | | - break; |
236 | | - } |
237 | | - $first = $bigram[0]; |
238 | | - $second = $bigram[1]; |
239 | | - $new_word = array(); |
240 | | - $i = 0; |
241 | | - while ($i < count($word)) |
242 | | - { |
243 | | - $j = gpt_indexOf($word, $first, $i); |
244 | | - if ($j === -1) |
245 | | - { |
246 | | - $new_word = array_merge($new_word, array_slice($word, $i, null, true)); |
247 | | - break; |
248 | | - } |
249 | | - if($i > $j) |
250 | | - { |
251 | | - $slicer = array(); |
252 | | - } |
253 | | - elseif($j == 0) |
254 | | - { |
255 | | - $slicer = array(); |
256 | | - } |
257 | | - else |
258 | | - { |
259 | | - $slicer = array_slice($word, $i, $j - $i, true); |
260 | | - } |
261 | | - $new_word = array_merge($new_word, $slicer); |
262 | | - if(count($new_word) > $init_len) |
263 | | - { |
264 | | - break; |
265 | | - } |
266 | | - $i = $j; |
267 | | - if ($word[$i] === $first && $i < count($word) - 1 && $word[$i + 1] === $second) |
268 | | - { |
269 | | - array_push($new_word, $first . $second); |
270 | | - $i = $i + 2; |
271 | | - } |
272 | | - else |
273 | | - { |
274 | | - array_push($new_word, $word[$i]); |
275 | | - $i = $i + 1; |
276 | | - } |
277 | | - } |
278 | | - if($word == $new_word) |
279 | | - { |
280 | | - break; |
281 | | - } |
282 | | - $word = $new_word; |
283 | | - if (count($word) === 1) |
284 | | - { |
285 | | - break; |
286 | | - } |
287 | | - else |
288 | | - { |
289 | | - $pairs = gpt_get_pairs($word); |
290 | | - } |
291 | | - } |
292 | | - $word = implode(' ', $word); |
293 | | - $cache[$token] = $word; |
294 | | - return $word; |
295 | | -} |
296 | | -function gpt_indexOf($arrax, $searchElement, $fromIndex) |
297 | | -{ |
298 | | - $index = 0; |
299 | | - foreach($arrax as $index => $value) |
300 | | - { |
301 | | - if($index < $fromIndex) |
302 | | - { |
303 | | - $index++; |
304 | | - continue; |
305 | | - } |
306 | | - if($value == $searchElement) |
307 | | - { |
308 | | - return $index; |
309 | | - } |
310 | | - $index++; |
311 | | - } |
312 | | - return -1; |
313 | | -} |
| 5 | +use CodeRevolutionPlugins\GPT3Encoder\Encoder; |
314 | 6 |
|
315 | 7 | $prompt = "Many words map to one token, but some don't: indivisible. Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾 Sequences of characters commonly found next to each other may be grouped together: 1234567890"; |
316 | | -$token_array = gpt_encode($prompt); |
| 8 | +$token_array = Encoder::instance()->encode($prompt); |
317 | 9 | error_log('Token array: ' . print_r($token_array, true)); |
318 | 10 | error_log('Count: ' . count($token_array)); |
319 | 11 |
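
The commit above swaps the inline GPT-2 BPE functions for the Composer package's Encoder class. Below is a minimal round-trip sketch of the new flow; it assumes the package exposes encode() and decode() as counterparts to the removed gpt_encode() (only Encoder::instance() is confirmed by the diff), so method names may need adjusting to the library's actual API.

<?php
// Minimal sketch, assuming Encoder::instance()->encode()/->decode() mirror
// the removed gpt_encode(): text -> token ids -> text. The method names are
// an assumption; only Encoder::instance() appears in the diff itself.
require_once __DIR__ . '/vendor/autoload.php';

use CodeRevolutionPlugins\GPT3Encoder\Encoder;

$prompt = "Many words map to one token, but some don't: indivisible.";

$tokens = Encoder::instance()->encode($prompt);        // text -> array of token ids
error_log('Count: ' . count($tokens));                 // number of tokens in the prompt
error_log('Round trip: ' . Encoder::instance()->decode($tokens)); // token ids -> text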
|
|