@@ -104,6 +104,7 @@ public class PTBTokenizerTest {
104104 // the space is because some weirdness happens having an
105105 // unmatched surrogate at the end of a text
106106 "half codepoint:" + ((char ) 55296 ) + " " ,
107+ "There are ,2 days left" ,
107108 };
108109
109110 private final String [][] ptbGold = {
@@ -201,6 +202,7 @@ public class PTBTokenizerTest {
201202 { "What" , "do" , "you" , "suppose" , "is" , "in" , "the" , "file" , "thicc_antennae" , "." , "asdf" , "?" },
202203 { "two" , "character" , "codepoint" , ":" , "😸" },
203204 { "half" , "codepoint" , ":" , },
205+ { "There" , "are" , "," , "2" , "days" , "left" , },
204206 };
205207
206208 private final String [][] ptbGoldSplitHyphenated = {
@@ -306,6 +308,7 @@ public class PTBTokenizerTest {
306308 { "What" , "do" , "you" , "suppose" , "is" , "in" , "the" , "file" , "thicc_antennae" , "." , "asdf" , "?" },
307309 { "two" , "character" , "codepoint" , ":" , "😸" },
308310 { "half" , "codepoint" , ":" , },
311+ { "There" , "are" , "," , "2" , "days" , "left" , },
309312 };
310313
311314 @ Test
0 commit comments