Skip to content

Commit de444fd

Browse files
committed
Fix normalize() error on Python 2.7
1 parent 1689296 commit de444fd

File tree

2 files changed

+27
-27
lines changed

2 files changed

+27
-27
lines changed

pythainlp/test/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def test_corpus(self):
6767
def test_collation(self):
6868
self.assertEqual(collation(['ไก่','กก']),[u'กก', u'ไก่'])
6969
def test_normalize(self):
70-
self.assertEqual(normalize(u"เเปลก")==u"แปลก",True)
70+
self.assertEqual(normalize("เเปลก"),"แปลก")
7171
def test_keywords(self):
7272
self.assertEqual(find_keyword(word_tokenize("แมวกินปลาอร่อยรู้ไหมว่าแมวเป็นแมวรู้ไหมนะแมว",engine='newmm')),{u'แมว': 4})
7373
def test_tag(self):

pythainlp/util/__init__.py

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -23,33 +23,33 @@ def trigram(token):
2323
'''
2424
return ngrams(token,3)
2525
rule1=[
26-
"ะ",
27-
"ั",
28-
"็",
29-
"า",
30-
"ิ",
31-
"่",
32-
"ํ",
33-
"ุ",
34-
"ู",
35-
"ใ",
36-
"ไ",
37-
"โ",
38-
"ื"
39-
"่",
40-
"้",
41-
"๋",
42-
"๊",
43-
"ึ",
44-
"์",
45-
"๋",
46-
"ำ"
26+
u"ะ",
27+
u"ั",
28+
u"็",
29+
u"า",
30+
u"ิ",
31+
u"่",
32+
u"ํ",
33+
u"ุ",
34+
u"ู",
35+
u"ใ",
36+
u"ไ",
37+
u"โ",
38+
u"ื"
39+
u"่",
40+
u"้",
41+
u"๋",
42+
u"๊",
43+
u"ึ",
44+
u"์",
45+
u"๋",
46+
u"ำ"
4747
] # เก็บพวกสระ วรรณยุกต์ที่ซ้ำกันแล้วมีปัญหา
4848
rule2=[
49-
("เเ","แ"),
50-
("ํ้า","้ำ"),
51-
("ํา้","้ำ"),
52-
("้ั","ั้")
49+
(u"เเ",u"แ"),
50+
(u"ํ้า",u"้ำ"),
51+
(u"ํา้",u"้ำ"),
52+
(u"้ั",u"ั้")
5353
] # เก็บพวก พิมพ์ลำดับผิดหรือผิดแป้นแต่กลับแสดงผลถูกต้อง ให้ไปเป็นแป้นที่ถูกต้อง เช่น เ + เ ไปเป็น แ
5454
def normalize(text):
5555
"""
@@ -61,5 +61,5 @@ def normalize(text):
6161
True
6262
"""
6363
for data in rule2+list(zip(rule1,rule1)):
64-
text=re.sub(data[0]+"+",data[1],text)
64+
text=re.sub(data[0]+"+",data[1],text,re.U)
6565
return text

0 commit comments

Comments
 (0)