@@ -25,31 +25,35 @@ class WordTokenizer(object):
2525
2626 >>> s = '''population of 100,000'''
2727 >>> WordTokenizer().tokenize(s)
28- ['population', 'of', '100,000']
28+ ['population', 'of', '100000']
29+
30+ >>> s = '''unit 6,'''
31+ >>> WordTokenizer().tokenize(s)
32+ ['unit', '6', ',']
2933
3034 """
3135 def tokenize (self , text ):
32- #starting quotes
36+ # starting quotes
3337 text = re .sub (r'^\"' , r'``' , text )
3438 text = re .sub (r'(``)' , r' \1 ' , text )
3539 text = re .sub (r'([ (\[{<])"' , r'\1 `` ' , text )
3640
37- #punctuation
38- text = re .sub (r'(?<! \d)([,])' , r' \1 ' , text ) # CHANGED :
41+ # punctuation
42+ text = re .sub (r'(?<= \d)([,])(?=\d) ' , ' ' , text ) # remove ',' in digits
3943 text = re .sub (r'\.\.\.' , r' ... ' , text )
40- text = re .sub (r'[;#$%&]' , r' \g<0> ' , text ) # CHANGED @
44+ text = re .sub (r'[;#$%&, ]' , r' \g<0> ' , text ) # CHANGED @
4145
4246
4347 text = re .sub (r'([^\.])(\.)([\]\)}>"\']*)\s*$' , r'\1 \2\3 ' , text )
4448 text = re .sub (r'[?!]' , r' \g<0> ' , text )
4549
4650 text = re .sub (r"([^'])' " , r"\1 ' " , text )
4751
48- #parens, brackets, etc.
52+ # parens, brackets, etc.
4953 text = re .sub (r'[\]\[\(\)\{\}\<\>]' , r' \g<0> ' , text )
5054 text = re .sub (r'--' , r' -- ' , text )
5155
52- #add extra space to make things easier
56+ # add extra space to make things easier
5357 text = " " + text + " "
5458
5559 #ending quotes
0 commit comments