Skip to content

Commit ed83a18

Browse files
authored
Merge pull request #978 from bact/add-generate-test
Fix warnings and types
2 parents cf3e625 + 8409f8f commit ed83a18

File tree

15 files changed

+299
-240
lines changed

15 files changed

+299
-240
lines changed

.github/workflows/unittest.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ jobs:
2020
strategy:
2121
fail-fast: false
2222
matrix:
23-
os: ["macos-latest", "ubuntu-latest", "windows-latest"]
24-
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
23+
os: ["ubuntu-latest", "windows-latest", "macos-latest"]
24+
python-version: ["3.13", "3.12", "3.11", "3.10", "3.9"]
2525

2626
runs-on: ${{ matrix.os }}
2727
env:

pythainlp/corpus/common.py

Lines changed: 52 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
Common lists of words.
77
"""
88

9+
import ast
10+
911
__all__ = [
1012
"countries",
1113
"find_synonyms",
@@ -56,9 +58,9 @@
5658

5759
_THAI_ORST_WORDS: FrozenSet[str] = frozenset()
5860

59-
_THAI_DICT = {}
60-
_THAI_WSD_DICT = {}
61-
_THAI_SYNONYMS = {}
61+
_THAI_DICT: dict[str, list] = {}
62+
_THAI_WSD_DICT: dict[str, list] = {}
63+
_THAI_SYNONYMS: dict[str, list] = {}
6264

6365

6466
def countries() -> FrozenSet[str]:
@@ -268,17 +270,22 @@ def thai_dict() -> dict:
268270
:rtype: dict
269271
"""
270272
global _THAI_DICT
271-
if not _THAI_DICT:
272-
import csv
273-
274-
_THAI_DICT = {"word": [], "meaning": []}
275-
with open(
276-
get_corpus_path("thai_dict"), newline="\n", encoding="utf-8"
277-
) as csvfile:
278-
reader = csv.DictReader(csvfile, delimiter=",")
279-
for row in reader:
280-
_THAI_DICT["word"].append(row["word"])
281-
_THAI_DICT["meaning"].append(row["meaning"])
273+
if _THAI_DICT:
274+
return _THAI_DICT
275+
276+
import csv
277+
278+
path = get_corpus_path("thai_dict")
279+
if not path:
280+
return _THAI_DICT
281+
path = str(path)
282+
283+
_THAI_DICT = {"word": [], "meaning": []}
284+
with open(path, newline="\n", encoding="utf-8") as csvfile:
285+
reader = csv.DictReader(csvfile, delimiter=",")
286+
for row in reader:
287+
_THAI_DICT["word"].append(row["word"])
288+
_THAI_DICT["meaning"].append(row["meaning"])
282289

283290
return _THAI_DICT
284291

@@ -293,18 +300,20 @@ def thai_wsd_dict() -> dict:
293300
:rtype: dict
294301
"""
295302
global _THAI_WSD_DICT
296-
if not _THAI_WSD_DICT:
297-
_thai_wsd = thai_dict()
298-
_THAI_WSD_DICT = {"word": [], "meaning": []}
299-
for i, j in zip(_thai_wsd["word"], _thai_wsd["meaning"]):
300-
_all_value = list(eval(j).values())
301-
_use = []
302-
for k in _all_value:
303-
_use.extend(k)
304-
_use = list(set(_use))
305-
if len(_use) > 1:
306-
_THAI_WSD_DICT["word"].append(i)
307-
_THAI_WSD_DICT["meaning"].append(_use)
303+
if _THAI_WSD_DICT:
304+
return _THAI_WSD_DICT
305+
306+
thai_wsd = thai_dict()
307+
_THAI_WSD_DICT = {"word": [], "meaning": []}
308+
for i, j in zip(thai_wsd["word"], thai_wsd["meaning"]):
309+
all_value = list(ast.literal_eval(j).values())
310+
use = []
311+
for k in all_value:
312+
use.extend(k)
313+
use = list(set(use))
314+
if len(use) > 1:
315+
_THAI_WSD_DICT["word"].append(i)
316+
_THAI_WSD_DICT["meaning"].append(use)
308317

309318
return _THAI_WSD_DICT
310319

@@ -319,18 +328,23 @@ def thai_synonyms() -> dict:
319328
:rtype: dict
320329
"""
321330
global _THAI_SYNONYMS
322-
if not _THAI_SYNONYMS:
323-
import csv
324-
325-
_THAI_SYNONYMS = {"word": [], "pos": [], "synonym": []}
326-
with open(
327-
get_corpus_path("thai_synonym"), newline="\n", encoding="utf-8"
328-
) as csvfile:
329-
reader = csv.DictReader(csvfile, delimiter=",")
330-
for row in reader:
331-
_THAI_SYNONYMS["word"].append(row["word"])
332-
_THAI_SYNONYMS["pos"].append(row["pos"])
333-
_THAI_SYNONYMS["synonym"].append(row["synonym"].split("|"))
331+
if _THAI_SYNONYMS:
332+
return _THAI_SYNONYMS
333+
334+
import csv
335+
336+
path = get_corpus_path("thai_synonym")
337+
if not path:
338+
return _THAI_SYNONYMS
339+
path = str(path)
340+
341+
_THAI_SYNONYMS = {"word": [], "pos": [], "synonym": []}
342+
with open(path, newline="\n", encoding="utf-8") as csvfile:
343+
reader = csv.DictReader(csvfile, delimiter=",")
344+
for row in reader:
345+
_THAI_SYNONYMS["word"].append(row["word"])
346+
_THAI_SYNONYMS["pos"].append(row["pos"])
347+
_THAI_SYNONYMS["synonym"].append(row["synonym"].split("|"))
334348

335349
return _THAI_SYNONYMS
336350

pythainlp/corpus/oscar.py

Lines changed: 39 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -15,43 +15,51 @@
1515

1616
from pythainlp.corpus import get_corpus_path
1717

18-
_FILENAME = "oscar_icu"
18+
_OSCAR_FILENAME = "oscar_icu"
1919

2020

2121
def word_freqs() -> List[Tuple[str, int]]:
2222
"""
2323
Get word frequency from OSCAR Corpus (words tokenized using ICU)
2424
"""
25-
word_freqs = []
26-
_path = get_corpus_path(_FILENAME)
27-
with open(_path, "r", encoding="utf-8-sig") as f:
28-
_data = list(f.readlines())
29-
del _data[0]
30-
for line in _data:
31-
_temp = line.strip().split(",")
32-
if len(_temp) >= 2:
33-
if _temp[0] != " " and '"' not in _temp[0]:
34-
word_freqs.append((_temp[0], int(_temp[1])))
35-
elif _temp[0] == " ":
36-
word_freqs.append(("<s/>", int(_temp[1])))
37-
38-
return word_freqs
39-
40-
41-
def unigram_word_freqs() -> defaultdict:
25+
freqs: list[tuple[str, int]] = []
26+
path = get_corpus_path(_OSCAR_FILENAME)
27+
if not path:
28+
return freqs
29+
path = str(path)
30+
31+
with open(path, "r", encoding="utf-8-sig") as f:
32+
lines = list(f.readlines())
33+
del lines[0]
34+
for line in lines:
35+
temp = line.strip().split(",")
36+
if len(temp) >= 2:
37+
if temp[0] != " " and '"' not in temp[0]:
38+
freqs.append((temp[0], int(temp[1])))
39+
elif temp[0] == " ":
40+
freqs.append(("<s/>", int(temp[1])))
41+
42+
return freqs
43+
44+
45+
def unigram_word_freqs() -> dict[str, int]:
4246
"""
4347
Get unigram word frequency from OSCAR Corpus (words tokenized using ICU)
4448
"""
45-
_path = get_corpus_path(_FILENAME)
46-
_word_freqs = defaultdict(int)
47-
with open(_path, "r", encoding="utf-8-sig") as fh:
48-
_data = list(fh.readlines())
49-
del _data[0]
50-
for i in _data:
51-
_temp = i.strip().split(",")
52-
if _temp[0] != " " and '"' not in _temp[0]:
53-
_word_freqs[_temp[0]] = int(_temp[-1])
54-
elif _temp[0] == " ":
55-
_word_freqs["<s/>"] = int(_temp[-1])
56-
57-
return _word_freqs
49+
freqs: dict[str, int] = defaultdict(int)
50+
path = get_corpus_path(_OSCAR_FILENAME)
51+
if not path:
52+
return freqs
53+
path = str(path)
54+
55+
with open(path, "r", encoding="utf-8-sig") as fh:
56+
lines = list(fh.readlines())
57+
del lines[0]
58+
for i in lines:
59+
temp = i.strip().split(",")
60+
if temp[0] != " " and '"' not in temp[0]:
61+
freqs[temp[0]] = int(temp[-1])
62+
elif temp[0] == " ":
63+
freqs["<s/>"] = int(temp[-1])
64+
65+
return freqs

pythainlp/corpus/th_en_translit.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def get_transliteration_dict() -> defaultdict:
2828
"""
2929
Get Thai to English transliteration dictionary.
3030
31-
The returned dict is in defaultdict[str, defaultdict[List[str], List[Optional[bool]]]] format.
31+
The returned dict is in dict[str, dict[str, list]] format: each word maps to a dict of lists, as one word can have multiple transliterations.
3232
"""
3333
path = path_pythainlp_corpus(_FILE_NAME)
3434
if not path:
@@ -38,7 +38,7 @@ def get_transliteration_dict() -> defaultdict:
3838
)
3939

4040
# use list, as one word can have multiple transliterations.
41-
trans_dict = defaultdict(
41+
trans_dict: defaultdict[str, dict[str, list]] = defaultdict(
4242
lambda: {TRANSLITERATE_EN: [], TRANSLITERATE_FOLLOW_RTSG: []}
4343
)
4444
try:
@@ -61,11 +61,11 @@ def get_transliteration_dict() -> defaultdict:
6161
en_follow_rtgs
6262
)
6363

64-
except ValueError:
64+
except ValueError as exc:
6565
raise ValueError(
66-
f"Unable to parse {_FILE_NAME}."
66+
f"Unable to parse {_FILE_NAME}. "
6767
f"Make sure it is a 3-column tab-separated file with header."
68-
)
68+
) from exc
6969
else:
7070
return trans_dict
7171

pythainlp/corpus/tnc.py

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,20 @@
66
"""
77

88
__all__ = [
9-
"word_freqs",
10-
"unigram_word_freqs",
119
"bigram_word_freqs",
1210
"trigram_word_freqs",
11+
"unigram_word_freqs",
12+
"word_freqs",
1313
]
1414

1515
from collections import defaultdict
1616
from typing import List, Tuple
1717

1818
from pythainlp.corpus import get_corpus, get_corpus_path
1919

20-
_FILENAME = "tnc_freq.txt"
21-
_BIGRAM = "tnc_bigram_word_freqs"
22-
_TRIGRAM = "tnc_trigram_word_freqs"
20+
_UNIGRAM_FILENAME = "tnc_freq.txt"
21+
_BIGRAM_CORPUS_NAME = "tnc_bigram_word_freqs"
22+
_TRIGRAM_CORPUS_NAME = "tnc_trigram_word_freqs"
2323

2424

2525
def word_freqs() -> List[Tuple[str, int]]:
@@ -30,53 +30,61 @@ def word_freqs() -> List[Tuple[str, int]]:
3030
3131
Credit: Korakot Chaovavanich https://www.facebook.com/groups/thainlp/posts/434330506948445
3232
"""
33-
lines = list(get_corpus(_FILENAME))
34-
word_freqs = []
33+
freqs: list[tuple[str, int]] = []
34+
lines = list(get_corpus(_UNIGRAM_FILENAME))
3535
for line in lines:
3636
word_freq = line.split("\t")
3737
if len(word_freq) >= 2:
38-
word_freqs.append((word_freq[0], int(word_freq[1])))
38+
freqs.append((word_freq[0], int(word_freq[1])))
3939

40-
return word_freqs
40+
return freqs
4141

4242

43-
def unigram_word_freqs() -> defaultdict:
43+
def unigram_word_freqs() -> dict[str, int]:
4444
"""
4545
Get unigram word frequency from Thai National Corpus (TNC)
4646
"""
47-
lines = list(get_corpus(_FILENAME))
48-
_word_freqs = defaultdict(int)
47+
freqs: dict[str, int] = defaultdict(int)
48+
lines = list(get_corpus(_UNIGRAM_FILENAME))
4949
for i in lines:
5050
_temp = i.strip().split(" ")
5151
if len(_temp) >= 2:
52-
_word_freqs[_temp[0]] = int(_temp[-1])
52+
freqs[_temp[0]] = int(_temp[-1])
5353

54-
return _word_freqs
54+
return freqs
5555

5656

57-
def bigram_word_freqs() -> defaultdict:
57+
def bigram_word_freqs() -> dict[Tuple[str, str], int]:
5858
"""
5959
Get bigram word frequency from Thai National Corpus (TNC)
6060
"""
61-
_path = get_corpus_path(_BIGRAM)
62-
_word_freqs = defaultdict(int)
63-
with open(_path, "r", encoding="utf-8-sig") as fh:
61+
freqs: dict[tuple[str, str], int] = defaultdict(int)
62+
path = get_corpus_path(_BIGRAM_CORPUS_NAME)
63+
if not path:
64+
return freqs
65+
path = str(path)
66+
67+
with open(path, "r", encoding="utf-8-sig") as fh:
6468
for i in fh.readlines():
65-
_temp = i.strip().split(" ")
66-
_word_freqs[(_temp[0], _temp[1])] = int(_temp[-1])
69+
temp = i.strip().split(" ")
70+
freqs[(temp[0], temp[1])] = int(temp[-1])
6771

68-
return _word_freqs
72+
return freqs
6973

7074

71-
def trigram_word_freqs() -> defaultdict:
75+
def trigram_word_freqs() -> dict[Tuple[str, str, str], int]:
7276
"""
7377
Get trigram word frequency from Thai National Corpus (TNC)
7478
"""
75-
_path = get_corpus_path(_TRIGRAM)
76-
_word_freqs = defaultdict(int)
77-
with open(_path, "r", encoding="utf-8-sig") as fh:
79+
freqs: dict[tuple[str, str, str], int] = defaultdict(int)
80+
path = get_corpus_path(_TRIGRAM_CORPUS_NAME)
81+
if not path:
82+
return freqs
83+
path = str(path)
84+
85+
with open(path, "r", encoding="utf-8-sig") as fh:
7886
for i in fh.readlines():
79-
_temp = i.strip().split(" ")
80-
_word_freqs[(_temp[0], _temp[1], _temp[2])] = int(_temp[-1])
87+
temp = i.strip().split(" ")
88+
freqs[(temp[0], temp[1], temp[2])] = int(temp[-1])
8189

82-
return _word_freqs
90+
return freqs

0 commit comments

Comments
 (0)