New Thai word segmentation (newmm) by @korakot, plus thai2vec

wannaphong · wannaphong · commit b7c624c9fa65 · 2018-02-04T17:42:18.000+07:00
- New Thai word segmentation using maximal matching, by @korakot
- thai2vec word vectors, by @cstorm125
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
@@ -1,130 +1,127 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,unicode_literals
-'''
-โปรแกรม multi-cut
-ตัดคำภาษาไทยโดยใช้ Maximum Matching algorithm
-เดติดโค้ดต้นฉบับ คุณ Korakot Chaovavanich
-จาก https://www.facebook.com/groups/408004796247683/permalink/431283740586455/
-และ https://gist.github.com/korakot/fe26c65dc9eed467f4497f784a805716
+'''ตัวตัดคำภาษาไทยโดยใช้หลักการ maximal matching และ TCC
+พัฒนาโดยคุณ Korakot Chaovavanich
+Notebook : https://colab.research.google.com/notebook#fileId=1V1Z657_5eSWPo8rLfVRwA0A5E4vkg7SI
 '''
-import six
-if six.PY2:
-	from builtins import *
+from __future__ import absolute_import,unicode_literals
 import re
-import copy
-from pythainlp.tools import file_trie
-from marisa_trie import Trie
 from collections import defaultdict
-from pythainlp.tokenize import tcc
-class LatticeString(str):
-    ''' String subclass เพื่อเก็บวิธีตัดหลายๆ วิธี
-    '''
-    def __new__(cls, value, multi=None, in_dict=True):
-        return str.__new__(cls, value)
+from heapq import heappush, heappop  # for priority queue
+from marisa_trie import Trie
+from pythainlp.corpus.thaiword import get_data # ดึงข้อมูลรายการคำในภาษาไทย
 
-    def __init__(self, value, multi=None, in_dict=True):
-        self.unique = True
-        if multi:
-            self.multi = list(multi)
-            if len(self.multi) > 1:
-                self.unique = False
-        else:
-            self.multi = [value]
-        self.in_dict = in_dict   # บอกว่าเป็นคำมีในดิกหรือเปล่า
-spat_eng = r'''(?x)
-([\d,\.]\#)+|   # number
-([๑๒๓๔๕๖๗๘๙๐,\.]\#)+|   # thai number
-([a-zA-Z]\#)+|  # english
+
+# Fallback pattern for non-Thai runs: Latin words (hyphens allowed), numbers, spaces/tabs and newlines.
+pat_eng = re.compile(r'''(?x)
+[-a-zA-Z]+|   # english
+\d[\d,\.]*|   # number
 [ \t]+|       # space
 \r?\n         # newline
-'''
-pat_eng = re.compile(spat_eng)
-
-def multicut(text,data):
-    ''' ส่งคืน LatticeString คืนมาเป็นก้อนๆ
-    '''
-    words_at = defaultdict(list)  # main data structure
-    if data!="": # ถ้าหากกำหนดข้อมูลโดยใช้ dict ของตัวเอง
-        i=0
-        data2=copy.copy(data)
-        while i<len(data2):
-            data2[i]=tcc.tcc(data2[i],sep='#')
-            if(data2[len(data2[i])-1]!="#"):
-               data2[i]+="#"
-            i+=1
-        trie = Trie(data2)
-    else:
-        trie = file_trie(data="newmm")
-    def serialize(p, p2):    # helper function
-        for w in words_at[p]:
-            p_ = p + len(w)
-            if p_== p2:
-                yield w
-            elif p_ < p2:
-                for path in serialize(p_, p2):
-                    yield w+'/'+path
-    q = {0}
-    last_p = 0   # last position for yield
-    while min(q) < len(text):
-        p = min(q)
-        q -= {p}  # q.pop, but for set
+''')
+# TCC (Thai Character Cluster) pattern templates, one regex alternative per line.
+# In the template text every 'c' is expanded to the class [ก-ฮ] and every 't' to
+# the optional class [่-๋]?; the final split() turns the text into a list of
+# pattern strings (joined with "|" by tcc() before compiling).
+pat_tcc = """\
+เc็c
+เcctาะ
+เccีtยะ
+เccีtย(?=[เ-ไก-ฮ]|$)
+เccอะ
+เcc็c
+เcิc์c
+เcิtc
+เcีtยะ?
+เcืtอะ?
+เc[ิีุู]tย(?=[เ-ไก-ฮ]|$)
+เctา?ะ?
+cัtวะ
+c[ัื]tc[ุิะ]?
+c[ิุู]์
+c[ะ-ู]t
+c็
+ct[ะาำ]?
+แc็c
+แcc์
+แctะ
+แcc็c
+แccc์
+โctะ
+[เ-ไ]ct
+""".replace('c','[ก-ฮ]').replace('t', '[่-๋]?').split()
 
-        for w in trie.prefixes(text[p:]):
-            words_at[p].append(w)
-            q.add(p+len(w))
+def tcc(w):
+    """Yield the Thai Character Clusters (TCC) of ``w`` in order.
+
+    At every position the alternation of all ``pat_tcc`` patterns is tried;
+    when none matches, a single character is emitted so scanning always
+    advances.
+
+    :param w: text to split
+    :return: generator of consecutive substrings whose concatenation is ``w``
+    """
+    pat = re.compile("|".join(pat_tcc))
+    p = 0
+    end = len(w)
+    while p < end:
+        # Match in place at offset p rather than on w[p:]; slicing copied the
+        # whole remaining text at every step (quadratic on long inputs).
+        m = pat.match(w, p)
+        nxt = m.end() if m else p + 1
+        yield w[p:nxt]
+        p = nxt
 
-        if len(q)==1:
-            q0 = min(q)
-            yield LatticeString(text[last_p:q0], serialize(last_p, q0))
-            last_p = q0
+def tcc_pos(text):
+    """Return the set of end offsets of every TCC cluster in ``text``.
+
+    :param text: text to analyse
+    :return: set of integer offsets; each is the cumulative length of the
+        clusters produced by tcc() up to and including one cluster
+    """
+    boundaries = set()
+    offset = 0
+    for cluster in tcc(text):
+        offset = offset + len(cluster)
+        boundaries.add(offset)
+    return boundaries
+def serialize(words_at, p, p2):
+    """Yield, depth first, every way to cover the span p..p2 with recorded words.
+
+    :param words_at: mapping from a start offset to the list of words that
+        start at that offset
+    :param p: offset to start from
+    :param p2: offset every path has to end on exactly
+    :return: generator of paths; each path is a list of words whose lengths
+        add up to ``p2 - p``
+    """
+    for word in words_at[p]:
+        stop = p + len(word)
+        if stop > p2:
+            # Overshoots the target offset, so it cannot start a valid path.
+            continue
+        if stop == p2:
+            yield [word]
+            continue
+        for tail in serialize(words_at, stop, p2):
+            yield [word] + tail
+def onecut(text,data=['']):
+  """Lazily tokenize ``text`` by dictionary matching restricted to TCC boundaries.
+
+  :param text: string to segment
+  :param data: custom word list used to build the trie; the default ``['']``
+      means "use the built-in list returned by get_data()"
+  :return: generator yielding the tokens in text order
+
+  Candidate cut positions are kept in a min-heap. Whenever exactly one
+  position is pending, the segmentation up to it is unambiguous and the path
+  with the fewest words (enumerated by serialize) is emitted. When no
+  dictionary word starts at the current position, a pat_eng match is emitted,
+  or otherwise the shortest unknown span up to the next TCC boundary where a
+  dictionary word or pat_eng match begins.
+
+  NOTE(review): the trie is rebuilt on every call.
+  """
+  if(data!=['']):
+      trie = Trie(data)
+  else:
+      trie = Trie(get_data())
+  words_at = defaultdict(list)  # main data structure
+  allow_pos = tcc_pos(text)     # cut positions must coincide with TCC boundaries
+  
+  q = [0]       # min-heap queue
+  last_p = 0    # last position for yield
+  while q[0] < len(text):
+      p = heappop(q)
 
-        # กรณี len(q) == 0  คือ ไม่มีใน dict
-        elif len(q)==0:
-            m = pat_eng.match(text[p:])
-            if m!=None: # อังกฤษ, เลข, ว่าง
-                i = p + m.span()[1]
-            else: # skip น้อยที่สุด ที่เป็นไปได้
-                for i in range(p, len(text)):
-                    ww = trie.prefixes(text[i:])
-                    m = pat_eng.match(text[i:])
-                    if ww or m:
-                        break
-                else:
-                    i = len(text)
-            w = text[p:i]
+      for w in trie.prefixes(text[p:]):
+          p_ = p + len(w)
+          if p_ in allow_pos:  # keep only words that end on a TCC boundary
             words_at[p].append(w)
-            yield LatticeString(w, in_dict=False)
-            last_p = i
-            q.add(i)
+            if p_ not in q:
+              heappush(q, p_)   
 
-def mmcut(text,data=''):
-    res = []
-    text=tcc.tcc(text,sep='#') # ให้นำข้อความมาผ่าน tcc
-    if(text[len(text)-1]!='#'): # ถ้าตัวสุดท้ายของสตริงไม่เป็น #
-        text+='#' # ให้เพิ่ม # เข้าไป
-    for w in multicut(text,data=data):
-        mm = min(w.multi, key=lambda x: x.count('/'))
-        res.extend(mm.split('/'))
-    return [x.replace('#','') for x in res if x!='#'] # เอา # ออก
-def combine(ww):
-    if ww == []:
-        yield ""
-    else:
-        w = ww[0]
-        for tail in combine(ww[1:]):
-            if w.unique:
-                yield w+"|"+tail
-            else:
-                for m in w.multi:
-                    yield m.replace("/","|")+"|"+tail
+      # q of length 1: no ambiguity is left, so everything up to q[0] can be emitted
+      if len(q)==1:
+          paths = serialize(words_at, last_p, q[0])
+          for w in min(paths, key=len):
+            yield w
+          last_p = q[0]
 
-def listcut(text,data=''):
-    '''
-	ใช้ในการหา list ที่สามารถตัดคำได้ทั้งหมด
-	'''
-    ww = list(multicut(text,data))
-    return list(combine(ww))
-if __name__ == "__main__":
-	text='ผมรักคุณนะครับโอเคบ่พวกเราเป็นคนไทยรักภาษาไทยภาษาบ้านเกิด'
-	mmcut(text)
-	#print(listcut(text))
+      # q of length 0: no dictionary word starts at p
+      if len(q)==0:
+          m = pat_eng.match(text[p:])
+          if m: # English, number, or whitespace
+              i = p + m.end()
+          else: # skip the smallest possible span
+              for i in range(p+1, len(text)):
+                  if i in allow_pos:   # only resume on a TCC boundary
+                      ww = trie.prefixes(text[i:])
+                      m = pat_eng.match(text[i:])
+                      if ww or m:
+                          break
+              else:
+                  i = len(text)
+          w = text[p:i]
+          words_at[p].append(w)
+          yield w
+          last_p = i
+          heappush(q, i)
+          
+# ช่วยให้ไม่ต้องพิมพ์ยาวๆ
+def mmcut(text,data=['']):
+    """Tokenize ``text`` with onecut and return the tokens as a list.
+
+    :param text: string to segment
+    :param data: word list forwarded unchanged to onecut
+    :return: list of the tokens produced by onecut
+    """
+    tokens = onecut(text, data=data)
+    return list(tokens)
diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py
@@ -33,7 +33,7 @@ def file_trie(data):
 			from pythainlp.corpus.thaiword import get_data # ข้อมูลเก่า
 			data=get_data()
 		else:
-			from pythainlp.corpus.newthaiword import get_data # ข้อมูลเก่า
+			from pythainlp.corpus.newthaiword import get_data # ข้อมูลใหม่
 			data=get_data()
 		with open(path,'wb') as dill_file:
 			dill.dump(marisa_trie.Trie(data),dill_file)
diff --git a/pythainlp/word_vector/__init__.py b/pythainlp/word_vector/__init__.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+'''
+Code by https://github.com/cstorm125/thai2vec/blob/master/notebooks/examples.ipynb
+'''
+from __future__ import absolute_import,unicode_literals
+import six
+import sys
+if six.PY2:
+	print("Thai sentiment in pythainlp. Not support python 2.7")
+	sys.exit(0)
+try:
+	from gensim.models import KeyedVectors
+	import numpy as np
+except ImportError:
+	import pip
+	pip.main(['install','gensim','numpy'])
+	try:
+		from gensim.models import KeyedVectors
+		import numpy as np
+	except ImportError:
+		print("Error ! using 'pip install gensim numpy'")
+		sys.exit(0)
+from pythainlp.tokenize import word_tokenize
+import os
+
+def download():
+	"""Return the local path of the thai2vec model, downloading it when missing.
+
+	The model is stored as ``thai2vec.vec`` inside ``~/pythainlp-data``; the
+	directory is created when it does not exist yet.
+
+	:return: path of the local ``thai2vec.vec`` file
+	"""
+	path = os.path.join(os.path.expanduser("~"), 'pythainlp-data')
+	if not os.path.exists(path):
+		os.makedirs(path)
+	path = os.path.join(path, 'thai2vec.vec')
+	if not os.path.exists(path):
+		print("Download models...")
+		from urllib import request
+		# Fetch into a temporary sibling file and rename it only after the
+		# transfer finished; otherwise an interrupted download would leave a
+		# truncated thai2vec.vec that later calls mistake for the full model.
+		tmp_path = path + '.part'
+		request.urlretrieve("https://github.com/cstorm125/thai2vec/raw/master/data/thaiwiki/models/thai2vec.vec",tmp_path)
+		os.replace(tmp_path, path)
+		print("OK.")
+	return path
+# Holds the loaded model so the text-format vectors are parsed only once per process.
+_MODEL_CACHE = {}
+def get_model():
+	"""Return the thai2vec KeyedVectors model, loading it on first use.
+
+	The file returned by download() is parsed once; later calls return the
+	same cached object instead of re-reading the whole file.
+
+	:return: the model loaded with KeyedVectors.load_word2vec_format
+	"""
+	if 'thai2vec' not in _MODEL_CACHE:
+		_MODEL_CACHE['thai2vec'] = KeyedVectors.load_word2vec_format(download(),binary=False)
+	return _MODEL_CACHE['thai2vec']
+def most_similar_cosmul(positive,negative):
+	"""Forward to the model's ``most_similar_cosmul``.
+
+	:param positive: list passed as the ``positive`` keyword
+	:param negative: list passed as the ``negative`` keyword
+	:return: whatever the model's ``most_similar_cosmul`` returns
+	"""
+	model = get_model()
+	return model.most_similar_cosmul(positive=positive, negative=negative)
+def doesnt_match(listdata):
+	"""Forward ``listdata`` to the model's ``doesnt_match`` and return its result."""
+	model = get_model()
+	return model.doesnt_match(listdata)
+def similarity(word1,word2):
+	"""Forward both words to the model's ``similarity`` and return its result."""
+	model = get_model()
+	return model.similarity(word1,word2)
+def sentence_vectorizer(ss,dim=300,use_mean=False):
+    """Sum (or average) the word vectors of the tokens of ``ss``.
+
+    :param ss: text; it is tokenized with word_tokenize
+    :param dim: width of the result; must equal the model's vector size
+    :param use_mean: when true, divide the sum by the number of tokens
+        (all tokens, including those missing from the model)
+    :return: array of shape ``(1, dim)``; all zeros when no token is known
+    """
+    s = word_tokenize(ss)
+    vec = np.zeros((1,dim))
+    # Fetch the model once; the original called get_model() up to twice per
+    # token, and each call may have to load the whole vector file.
+    wv = get_model().wv
+    known_words = wv.index2word
+    for word in s:
+        if word in known_words:
+            vec += wv.word_vec(word)
+    # Skip the division for an empty token list: dividing the zero vector by
+    # zero would turn the result into NaN.
+    if use_mean and len(s) > 0:
+        vec /= len(s)
+    return vec
+def about():
+	"""Return the credits text for thai2vec (project blurb, developer, GitHub URL)."""
+	text = '''
+	thai2vec
+	Language Modeling, Word2Vec and Text Classification in Thai Language. Created as part of pyThaiNLP.
+	
+	Development : Charin Polpanumas
+	GitHub : https://github.com/cstorm125/thai2vec
+	'''
+	return text