
Commit 1af303d

add Cutkum Word Segmentation
Cutkum: https://github.com/pucktada/cutkum
1 parent 4a979d9 commit 1af303d

File tree

2 files changed: 34 additions & 1 deletion


pythainlp/tokenize/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -89,6 +89,11 @@ def word_tokenize(text,engine='newmm'):
         Uses a Deep Neural Network to segment Thai text
         '''
         from .deepcut import segment
+    elif engine=='cutkum':
+        '''
+        Uses a Deep Neural Network to segment Thai text (https://github.com/pucktada/cutkum)
+        '''
+        from .cutkum import segment
     elif engine=='wordcutpy':
         '''
         wordcutpy uses wordcutpy (https://github.com/veer66/wordcutpy) for word segmentation

@@ -161,4 +166,4 @@ def syllable_tokenize(text1):
             i+=1
     else:
         data=dict_word_tokenize(text=text1,data=get_data(),data_type="list")
-    return data
+    return data
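
As a quick illustration of the new engine option, here is a minimal usage sketch (assuming pythainlp and the cutkum package, which itself depends on TensorFlow, are installed; the sample sentence is arbitrary):

# Hypothetical usage of the 'cutkum' engine added in this commit.
# Assumes pythainlp and cutkum (plus TensorFlow) are installed.
from pythainlp.tokenize import word_tokenize

words = word_tokenize("ผมรักภาษาไทย", engine='cutkum')
print(words)  # a list of Thai word tokens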

pythainlp/tokenize/cutkum.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
+import sys
+import os
+try:
+    from cutkum.tokenizer import Cutkum
+except ImportError:
+    '''In case cutkum is not yet installed on the system'''
+    import pip
+    pip.main(['install', 'cutkum'])
+    try:
+        from cutkum.tokenizer import Cutkum
+    except ImportError:
+        sys.exit('Error! Please run: pip install cutkum')
+def get_model():
+    path = os.path.join(os.path.expanduser("~"), 'pythainlp-data')
+    if not os.path.exists(path):
+        os.makedirs(path)
+    path = os.path.join(path, 'lstm.l6.d2.pb')
+    if not os.path.exists(path):
+        print("Downloading model...")
+        from urllib import request
+        request.urlretrieve("https://raw.githubusercontent.com/pucktada/cutkum/master/model/lstm.l6.d2.pb",path)
+        print("OK.")
+    return path
+ck = Cutkum(get_model())
+def segment(text):
+    return ck.tokenize(text)
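
And a smoke test of the new module itself (a sketch; importing pythainlp.tokenize.cutkum triggers the model download to ~/pythainlp-data on first use, as implemented above):

# Hypothetical direct use of the new cutkum backend.
from pythainlp.tokenize.cutkum import segment

print(segment("ผมรักภาษาไทย"))  # expected: a list of Thai tokens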
