File tree Expand file tree Collapse file tree 2 files changed +27
-27
lines changed
Expand file tree Collapse file tree 2 files changed +27
-27
lines changed Original file line number Diff line number Diff line change @@ -67,7 +67,7 @@ def test_corpus(self):
6767 def test_collation (self ):
6868 self .assertEqual (collation (['ไก่' ,'กก' ]),[u'กก' , u'ไก่' ])
6969 def test_normalize (self ):
70- self .assertEqual (normalize (u "เเปลก" )== u "แปลก", True )
70+ self .assertEqual (normalize ("เเปลก" ), "แปลก" )
7171 def test_keywords (self ):
7272 self .assertEqual (find_keyword (word_tokenize ("แมวกินปลาอร่อยรู้ไหมว่าแมวเป็นแมวรู้ไหมนะแมว" ,engine = 'newmm' )),{u'แมว' : 4 })
7373 def test_tag (self ):
Original file line number Diff line number Diff line change @@ -23,33 +23,33 @@ def trigram(token):
2323 '''
2424 return ngrams (token ,3 )
2525rule1 = [
26- "ะ" ,
27- "ั" ,
28- "็" ,
29- "า" ,
30- "ิ" ,
31- "่" ,
32- "ํ" ,
33- "ุ" ,
34- "ู" ,
35- "ใ" ,
36- "ไ" ,
37- "โ" ,
38- "ื"
39- "่" ,
40- "้" ,
41- "๋" ,
42- "๊" ,
43- "ึ" ,
44- "์" ,
45- "๋" ,
46- "ำ"
26+ u "ะ" ,
27+ u "ั" ,
28+ u "็" ,
29+ u "า" ,
30+ u "ิ" ,
31+ u "่" ,
32+ u "ํ" ,
33+ u "ุ" ,
34+ u "ู" ,
35+ u "ใ" ,
36+ u "ไ" ,
37+ u "โ" ,
38+ u "ื" ,
39+ u "่" ,
40+ u "้" ,
41+ u "๋" ,
42+ u "๊" ,
43+ u "ึ" ,
44+ u "์" ,
45+ u "๋" ,
46+ u "ำ"
4747] # เก็บพวกสระ วรรณยุกต์ที่ซ้ำกันแล้วมีปัญหา
4848rule2 = [
49- ("เเ" ,"แ" ),
50- ("ํ้า" ,"้ำ" ),
51- ("ํา้" ,"้ำ" ),
52- ("้ั" ,"ั้" )
49+ (u "เเ" ,u "แ" ),
50+ (u "ํ้า" ,u "้ำ" ),
51+ (u "ํา้" ,u "้ำ" ),
52+ (u "้ั" ,u "ั้" )
5353] # เก็บพวก พิมพ์ลำดับผิดหรือผิดแป้นแต่กลับแสดงผลถูกต้อง ให้ไปเป็นแป้นที่ถูกต้อง เช่น เ + เ ไปเป็น แ
5454def normalize (text ):
5555 """
@@ -61,5 +61,5 @@ def normalize(text):
6161 True
6262 """
6363 for data in rule2 + list (zip (rule1 ,rule1 )):
64- text = re .sub (data [0 ]+ "+" ,data [1 ],text )
64+ text = re .sub (data [0 ]+ "+" ,data [1 ],text ,flags = re .U )
6565 return text
You can’t perform that action at this time.
0 commit comments