Skip to content

Commit a422ac2

Browse files
committed
add pythainlp.word_vector.thai2vec
1 parent b7c624c commit a422ac2

File tree

3 files changed

+67
-65
lines changed

3 files changed

+67
-65
lines changed

pythainlp/tools/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
import dill
55
from pythainlp.tokenize import tcc
6+
import marisa_trie
67
def file_trie(data):
78
'''
89
ใช้สร้างไฟล์ข้อมูลสำหรับระบบที่ใช้ trie
@@ -18,7 +19,6 @@ def file_trie(data):
1819
path = os.path.join(path, 'pythainlp_trie2.data')
1920
if not os.path.exists(path):
2021
#ถ้าไม่มีไฟล์
21-
import marisa_trie
2222
if data=="newmm":
2323
from pythainlp.corpus.thaiword import get_data # ข้อมูลเก่า
2424
data2=get_data()

pythainlp/word_vector/__init__.py

Lines changed: 1 addition & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,2 @@
11
# -*- coding: utf-8 -*-
2-
'''
3-
Code by https://github.com/cstorm125/thai2vec/blob/master/notebooks/examples.ipynb
4-
'''
5-
from __future__ import absolute_import,unicode_literals
6-
import six
7-
import sys
8-
if six.PY2:
9-
print("Thai sentiment in pythainlp. Not support python 2.7")
10-
sys.exit(0)
11-
try:
12-
from gensim.models import KeyedVectors
13-
import numpy as np
14-
except ImportError:
15-
import pip
16-
pip.main(['install','gensim','numpy'])
17-
try:
18-
from gensim.models import KeyedVectors
19-
import numpy as np
20-
except ImportError:
21-
print("Error ! using 'pip install gensim numpy'")
22-
sys.exit(0)
23-
from pythainlp.tokenize import word_tokenize
24-
import os
25-
26-
def download():
27-
path = os.path.join(os.path.expanduser("~"), 'pythainlp-data')
28-
if not os.path.exists(path):
29-
os.makedirs(path)
30-
path = os.path.join(path, 'thai2vec.vec')
31-
if not os.path.exists(path):
32-
print("Download models...")
33-
from urllib import request
34-
request.urlretrieve("https://github.com/cstorm125/thai2vec/raw/master/data/thaiwiki/models/thai2vec.vec",path)
35-
print("OK.")
36-
return path
37-
def get_model():
38-
return KeyedVectors.load_word2vec_format(download(),binary=False)
39-
def most_similar_cosmul(positive,negative):
40-
'''
41-
การใช้งาน
42-
input list
43-
'''
44-
return get_model().most_similar_cosmul(positive=positive, negative=negative)
45-
def doesnt_match(listdata):
46-
return get_model().doesnt_match(listdata)
47-
def similarity(word1,word2):
48-
return get_model().similarity(word1,word2)
49-
def sentence_vectorizer(ss,dim=300,use_mean=False):
50-
s = word_tokenize(ss)
51-
vec = np.zeros((1,dim))
52-
for word in s:
53-
if word in get_model().wv.index2word:
54-
vec+= get_model().wv.word_vec(word)
55-
else: pass
56-
if use_mean: vec /= len(s)
57-
return(vec)
58-
def about():
59-
return '''
60-
thai2vec
61-
Language Modeling, Word2Vec and Text Classification in Thai Language. Created as part of pyThaiNLP.
62-
63-
Development : Charin Polpanumas
64-
GitHub : https://github.com/cstorm125/thai2vec
65-
'''
2+
from __future__ import absolute_import,unicode_literals

pythainlp/word_vector/thai2vec.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# -*- coding: utf-8 -*-
2+
'''
3+
Code by https://github.com/cstorm125/thai2vec/blob/master/notebooks/examples.ipynb
4+
'''
5+
from __future__ import absolute_import,unicode_literals
6+
import six
import sys

# This module is Python 3 only; on Python 2 it prints a notice and exits.
if six.PY2:
    print("Thai sentiment in pythainlp. Not support python 2.7")
    sys.exit(0)

try:
    from gensim.models import KeyedVectors
    import numpy as np
except ImportError:
    # Best-effort auto-install of the missing packages.
    # pip.main() is not a supported API and was removed in pip 10, so pip is
    # run as a subprocess of the interpreter that is executing this module.
    import subprocess
    subprocess.call([sys.executable, '-m', 'pip', 'install', 'gensim', 'numpy'])
    try:
        from gensim.models import KeyedVectors
        import numpy as np
    except ImportError:
        print("Error ! using 'pip install gensim numpy'")
        sys.exit(0)

from pythainlp.tokenize import word_tokenize
import os
25+
26+
def download():
    """Return the local path of the thai2vec model, downloading it if absent.

    The model is kept as ``thai2vec.vec`` inside ``~/pythainlp-data``. The
    directory is created when missing and the file is fetched only when it
    does not exist yet.

    :return: path of the local ``thai2vec.vec`` file.
    """
    data_dir = os.path.join(os.path.expanduser("~"), 'pythainlp-data')
    # exist_ok avoids failing when another process creates the directory
    # between an existence check and the makedirs call.
    os.makedirs(data_dir, exist_ok=True)
    path = os.path.join(data_dir, 'thai2vec.vec')
    if not os.path.exists(path):
        print("Download models...")
        from urllib import request
        # Download to a sibling temporary file and rename it only once the
        # transfer has finished, so an interrupted download is never
        # mistaken for a complete model on the next call.
        partial = path + '.part'
        try:
            request.urlretrieve("https://github.com/cstorm125/thai2vec/raw/master/data/thaiwiki/models/thai2vec.vec", partial)
            os.replace(partial, path)
        finally:
            # After a successful rename the temporary file no longer exists;
            # after a failure this removes the truncated leftover.
            if os.path.exists(partial):
                os.remove(partial)
        print("OK.")
    return path
37+
# Model loaded by get_model(); None until the first successful load.
_thai2vec_model = None


def get_model():
    """Return the thai2vec word vectors, loading them on the first call only.

    The file returned by ``download()`` is read with
    ``KeyedVectors.load_word2vec_format(..., binary=False)``. The loaded
    object is kept in a module-level variable so later calls reuse it instead
    of re-reading the model file.

    :return: the loaded ``KeyedVectors`` object (the same one on every call).
    """
    global _thai2vec_model
    if _thai2vec_model is None:
        _thai2vec_model = KeyedVectors.load_word2vec_format(download(), binary=False)
    return _thai2vec_model
39+
def most_similar_cosmul(positive,negative):
    """Run the model's ``most_similar_cosmul`` query.

    Usage: pass the inputs as lists.

    :param positive: list forwarded as the ``positive`` keyword argument.
    :param negative: list forwarded as the ``negative`` keyword argument.
    :return: the result of the model's ``most_similar_cosmul`` call.
    """
    model = get_model()
    result = model.most_similar_cosmul(positive=positive, negative=negative)
    return result
45+
def doesnt_match(listdata):
    """Pass ``listdata`` to the model's ``doesnt_match`` method and return its result."""
    model = get_model()
    odd_one = model.doesnt_match(listdata)
    return odd_one
47+
def similarity(word1,word2):
    """Pass ``word1`` and ``word2`` to the model's ``similarity`` method and return its result."""
    model = get_model()
    score = model.similarity(word1, word2)
    return score
49+
def sentence_vectorizer(ss,dim=300,use_mean=False):
    """Build one vector for a sentence by summing its word vectors.

    The sentence is split with ``word_tokenize``. The vector of every token
    found in the model is added to the result; tokens the model does not
    contain are ignored.

    :param ss: sentence to vectorize.
    :param dim: length of the model's word vectors; sizes the result array.
    :param use_mean: when true, divide the summed vector by the token count
        (all tokens, including those the model does not contain).
    :return: numpy array of shape ``(1, dim)``; all zeros when the sentence
        produces no tokens.
    """
    tokens = word_tokenize(ss)
    vec = np.zeros((1, dim))
    if not tokens:
        # Nothing to add, and dividing by len(tokens) would be 0/0.
        return vec
    # Fetch the model once instead of once (or twice) per token.
    vectors = get_model().wv
    # A set makes each membership test O(1) instead of a scan of the list.
    known_words = set(vectors.index2word)
    for word in tokens:
        if word in known_words:
            vec += vectors.word_vec(word)
    if use_mean:
        vec /= len(tokens)
    return vec
58+
# Return a multi-line credits string for thai2vec: its name, a one-line
# description, the developer and the project's GitHub URL.
def about():
59+
return '''
60+
thai2vec
61+
Language Modeling, Word2Vec and Text Classification in Thai Language. Created as part of pyThaiNLP.
62+
63+
Development : Charin Polpanumas
64+
GitHub : https://github.com/cstorm125/thai2vec
65+
'''

0 commit comments

Comments
 (0)