6
6
"""
7
7
8
8
# Public API of this module, listed in alphabetical order.
__all__ = [
    "bigram_word_freqs",
    "trigram_word_freqs",
    "unigram_word_freqs",
    "word_freqs",
]
14
14
15
15
from collections import defaultdict
16
16
from typing import List , Tuple
17
17
18
18
from pythainlp .corpus import get_corpus , get_corpus_path
19
19
20
# Identifiers of the Thai National Corpus (TNC) frequency data.
# The unigram data is loaded by file name through get_corpus(); the bigram
# and trigram data are located by corpus name through get_corpus_path().
_UNIGRAM_FILENAME = "tnc_freq.txt"
_BIGRAM_CORPUS_NAME = "tnc_bigram_word_freqs"
_TRIGRAM_CORPUS_NAME = "tnc_trigram_word_freqs"
23
23
24
24
25
25
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from Thai National Corpus (TNC)

    Each entry returned by ``get_corpus`` for the unigram file is split on
    a tab character; the first field is taken as the word and the second
    as its integer frequency. Entries with fewer than two fields are
    skipped.

    Credit: Korakot Chaovavanich https://www.facebook.com/groups/thainlp/posts/434330506948445

    :return: list of (word, frequency) tuples
    :rtype: List[Tuple[str, int]]
    :raises ValueError: if a frequency field is not a valid integer
    """
    freqs: List[Tuple[str, int]] = []
    # Iterate the corpus directly; copying it into a list first adds nothing.
    for line in get_corpus(_UNIGRAM_FILENAME):
        fields = line.split("\t")
        if len(fields) >= 2:
            freqs.append((fields[0], int(fields[1])))

    return freqs
41
41
42
42
43
def unigram_word_freqs() -> dict[str, int]:
    """
    Get unigram word frequency from Thai National Corpus (TNC)

    Reads the same corpus file as :func:`word_freqs` and therefore splits
    each entry on the same tab delimiter. The first field is the word and
    the last field is its integer frequency. Entries with fewer than two
    fields are skipped.

    :return: mapping of word to frequency. The object is a
        ``defaultdict(int)``, so looking up an unseen word yields 0
        (and inserts that key).
    :rtype: dict[str, int]
    :raises ValueError: if a frequency field is not a valid integer
    """
    freqs: dict[str, int] = defaultdict(int)
    # Iterate the corpus directly; copying it into a list first adds nothing.
    for line in get_corpus(_UNIGRAM_FILENAME):
        # Explicit "\t" escape keeps the delimiter visible and identical to
        # the one word_freqs() uses for this file.
        fields = line.strip().split("\t")
        if len(fields) >= 2:
            freqs[fields[0]] = int(fields[-1])

    return freqs
55
55
56
56
57
def bigram_word_freqs() -> dict[Tuple[str, str], int]:
    """
    Get bigram word frequency from Thai National Corpus (TNC)

    The corpus file is located with ``get_corpus_path``. Every line is
    expected to hold two words followed by an integer frequency (the
    frequency is taken from the last field). Blank lines and lines with
    fewer than three fields are skipped.

    :return: mapping of (word1, word2) to frequency. The object is a
        ``defaultdict(int)``, so looking up an unseen bigram yields 0
        (and inserts that key). Empty if the corpus path cannot be
        resolved.
    :rtype: dict[Tuple[str, str], int]
    :raises ValueError: if a frequency field is not a valid integer
    """
    freqs: dict[Tuple[str, str], int] = defaultdict(int)
    path = get_corpus_path(_BIGRAM_CORPUS_NAME)
    if not path:
        # Corpus unavailable: return the empty mapping.
        return freqs

    with open(str(path), "r", encoding="utf-8-sig") as fh:
        # Stream the file line by line instead of loading it with readlines().
        for raw_line in fh:
            line = raw_line.strip()
            # Prefer a tab delimiter when the line has one, otherwise fall
            # back to a single space.
            # NOTE(review): confirm the delimiter actually used by the file.
            sep = "\t" if "\t" in line else " "
            fields = line.split(sep)
            if len(fields) < 3:
                # Blank or truncated line: indexing it would raise IndexError.
                continue
            freqs[(fields[0], fields[1])] = int(fields[-1])

    return freqs
69
73
70
74
71
def trigram_word_freqs() -> dict[Tuple[str, str, str], int]:
    """
    Get trigram word frequency from Thai National Corpus (TNC)

    The corpus file is located with ``get_corpus_path``. Every line is
    expected to hold three words followed by an integer frequency (the
    frequency is taken from the last field). Blank lines and lines with
    fewer than four fields are skipped.

    :return: mapping of (word1, word2, word3) to frequency. The object is
        a ``defaultdict(int)``, so looking up an unseen trigram yields 0
        (and inserts that key). Empty if the corpus path cannot be
        resolved.
    :rtype: dict[Tuple[str, str, str], int]
    :raises ValueError: if a frequency field is not a valid integer
    """
    freqs: dict[Tuple[str, str, str], int] = defaultdict(int)
    path = get_corpus_path(_TRIGRAM_CORPUS_NAME)
    if not path:
        # Corpus unavailable: return the empty mapping.
        return freqs

    with open(str(path), "r", encoding="utf-8-sig") as fh:
        # Stream the file line by line instead of loading it with readlines().
        for raw_line in fh:
            line = raw_line.strip()
            # Prefer a tab delimiter when the line has one, otherwise fall
            # back to a single space.
            # NOTE(review): confirm the delimiter actually used by the file.
            sep = "\t" if "\t" in line else " "
            fields = line.split(sep)
            if len(fields) < 4:
                # Blank or truncated line: indexing it would raise IndexError.
                continue
            freqs[(fields[0], fields[1], fields[2])] = int(fields[-1])

    return freqs
0 commit comments