
Commit 1af303d

add Cutkum Word Segmentation
Cutkum: https://github.com/pucktada/cutkum
1 parent 4a979d9 commit 1af303d

File tree

2 files changed: 34 additions & 1 deletion


pythainlp/tokenize/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -89,6 +89,11 @@ def word_tokenize(text,engine='newmm'):
         Uses a Deep Neural Network to segment Thai text
         '''
         from .deepcut import segment
+    elif engine=='cutkum':
+        '''
+        Uses a Deep Neural Network to segment Thai text (https://github.com/pucktada/cutkum)
+        '''
+        from .cutkum import segment
     elif engine=='wordcutpy':
         '''
         wordcutpy uses wordcutpy (https://github.com/veer66/wordcutpy) for word segmentation

@@ -161,4 +166,4 @@ def syllable_tokenize(text1):
             i+=1
     else:
         data=dict_word_tokenize(text=text1,data=get_data(),data_type="list")
-    return data
+    return data
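
As a quick illustration of the new engine option, here is a minimal usage sketch (assuming pythainlp and the cutkum package, which itself depends on TensorFlow, are installed; the sample sentence is arbitrary):

# Hypothetical usage of the 'cutkum' engine added in this commit.
# Assumes pythainlp and cutkum (plus TensorFlow) are installed.
from pythainlp.tokenize import word_tokenize

words = word_tokenize("ผมรักภาษาไทย", engine='cutkum')
print(words)  # a list of Thai word tokens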

pythainlp/tokenize/cutkum.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
+import sys
+import os
+try:
+    from cutkum.tokenizer import Cutkum
+except ImportError:
+    '''In case cutkum is not yet installed on the system'''
+    import pip
+    pip.main(['install', 'cutkum'])
+    try:
+        from cutkum.tokenizer import Cutkum
+    except ImportError:
+        sys.exit('Error! Please run: pip install cutkum')
+def get_model():
+    path = os.path.join(os.path.expanduser("~"), 'pythainlp-data')
+    if not os.path.exists(path):
+        os.makedirs(path)
+    path = os.path.join(path, 'lstm.l6.d2.pb')
+    if not os.path.exists(path):
+        print("Downloading model...")
+        from urllib import request
+        request.urlretrieve("https://raw.githubusercontent.com/pucktada/cutkum/master/model/lstm.l6.d2.pb",path)
+        print("OK.")
+    return path
+ck = Cutkum(get_model())
+def segment(text):
+    return ck.tokenize(text)
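
And a smoke test of the new module itself (a sketch; importing pythainlp.tokenize.cutkum triggers the model download to ~/pythainlp-data on first use, as implemented above):

# Hypothetical direct use of the new cutkum backend.
from pythainlp.tokenize.cutkum import segment

print(segment("ผมรักภาษาไทย"))  # expected: a list of Thai tokens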
