Skip to content

Commit 68d4fdb

Browse files
committed
fix python2 in normalize
1 parent eee760e commit 68d4fdb

File tree

1 file changed

+14
-4
lines changed

1 file changed

+14
-4
lines changed

pythainlp/util/__init__.py

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
# -*- coding: utf-8 -*-
22
import re
3+
import six
34
from nltk.util import ngrams as ngramsdata
45
def ngrams(token,num):
56
'''
@@ -49,7 +50,12 @@ def trigram(token):
4950
(u"เเ",u"แ"),
5051
(u"ํ(t)า",u"\\1ำ"),
5152
(u"ํา(t)",u"\\1ำ"),
52-
(u"([่-๋])([ัิ-ื])",u"\\2\\1")
53+
(u"([่-๋])([ัิ-ื])",u"\\2\\1")]
54+
rule2py2=[
55+
(u"เเ",u"แ"),
56+
(u"ํ(t)า",u"\1ำ"),
57+
(u"ํา(t)",u"\1ำ"),
58+
(u"([่-๋])([ัิ-ื])",u"\2\1")
5359
] # เก็บพวก พิมพ์ลำดับผิดหรือผิดแป้นแต่กลับแสดงผลถูกต้อง ให้ไปเป็นแป้นที่ถูกต้อง เช่น เ + เ ไปเป็น แ
5460
def normalize(text):
5561
"""
@@ -60,8 +66,12 @@ def normalize(text):
6066
>>> print(normalize("เเปลก")=="แปลก") # เ เ ป ล ก กับ แปลก
6167
True
6268
"""
63-
for data in rule2:
64-
text=re.sub(data[0].replace("t","[่้๊๋]"),data[1],text,re.U)
69+
if six.PY2:
70+
for data in rule2py2:
71+
text=re.sub(data[0].replace(u"t",u"[่้๊๋]"),data[1],text,re.U)
72+
else:
73+
for data in rule2:
74+
text=re.sub(data[0].replace("t","[่้๊๋]"),data[1],text,re.U)
6575
for data in list(zip(rule1,rule1)):
66-
text=re.sub(data[0].replace("t","[่้๊๋]")+"+",data[1],text,re.U)
76+
text=re.sub(data[0].replace(u"t",u"[่้๊๋]")+"+",data[1],text,re.U)
6777
return text

0 commit comments

Comments
 (0)