Skip to content

Commit cf6997f

Browse files
authored
Merge pull request #968 from bact/add-cli-tests
Make CLI able to handle Unicode characters output on Windows console
2 parents 974b153 + 252e64e commit cf6997f

File tree

10 files changed: 110 additions and 80 deletions.

.github/workflows/unittest.yml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,8 @@ jobs:
7272
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
7373
run: pip install -r docker_requirements.txt
7474
- name: Install PyThaiNLP
75+
env:
76+
PYTHONIOENCODING: utf-8
7577
run: pip install .
7678
# If you want to install a safe small set of optional dependencies, use:
7779
# pip install .[compact]

pythainlp/cli/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,13 @@
33
# SPDX-License-Identifier: Apache-2.0
44
"""Command line helpers."""
55

6+
import io
67
import sys
78
from argparse import ArgumentParser
89

10+
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")
11+
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8")
12+
913
# a command should start with a verb when possible
1014
COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark"])
1115

pythainlp/cli/benchmark.py

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111

1212
from pythainlp import cli
1313
from pythainlp.benchmarks import word_tokenization
14+
from pythainlp.tools import safe_print
1415

1516

1617
def _read_file(path):
@@ -81,7 +82,7 @@ def __init__(self, name, argv):
8182
expected
8283
), "Input and test files do not have the same number of samples"
8384

84-
print(
85+
safe_print(
8586
"Benchmarking %s against %s with %d samples in total"
8687
% (args.input_file, args.test_file, len(actual))
8788
)
@@ -121,12 +122,12 @@ def __init__(self, name, argv):
121122
/ statistics["word_level:total_words_in_ref_sample"]
122123
)
123124

124-
print("============== Benchmark Result ==============")
125+
safe_print("============== Benchmark Result ==============")
125126

126127
for c in ["tp", "fn", "tn", "fp", "precision", "recall"]:
127128
c = f"char_level:{c}"
128129
v = statistics[c]
129-
print(f"{c:>40s} {v:.4f}")
130+
safe_print(f"{c:>40s} {v:.4f}")
130131

131132
for c in [
132133
"total_words_in_sample",
@@ -137,20 +138,20 @@ def __init__(self, name, argv):
137138
]:
138139
c = f"word_level:{c}"
139140
v = statistics[c]
140-
print(f"{c:>40s} {v:.4f}")
141+
safe_print(f"{c:>40s} {v:.4f}")
141142

142143
if args.save_details:
143144
dir_name = os.path.dirname(args.input_file)
144145
file_name = args.input_file.split("/")[-1].split(".")[0]
145146

146147
res_path = "%s/eval-%s.yml" % (dir_name, file_name)
147-
print("Evaluation result is saved to %s" % res_path)
148+
safe_print("Evaluation result is saved to %s" % res_path)
148149

149150
with open(res_path, "w", encoding="utf-8") as outfile:
150151
yaml.dump(statistics, outfile, default_flow_style=False)
151152

152153
res_path = "%s/eval-details-%s.json" % (dir_name, file_name)
153-
print("Details of comparisons is saved to %s" % res_path)
154+
safe_print("Details of comparisons is saved to %s" % res_path)
154155

155156
with open(res_path, "w", encoding="utf-8") as f:
156157
samples = []
@@ -160,7 +161,12 @@ def __init__(self, name, argv):
160161
del r["actual"]
161162

162163
samples.append(
163-
{"metrics": r, "expected": expected, "actual": actual, "id": i}
164+
{
165+
"metrics": r,
166+
"expected": expected,
167+
"actual": actual,
168+
"id": i,
169+
}
164170
)
165171

166172
details = {"metrics": statistics, "samples": samples}

pythainlp/cli/data.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
"""
55
Command line for PyThaiNLP's dataset/corpus management.
66
"""
7+
78
import argparse
89

910
from pythainlp import corpus

pythainlp/cli/soundex.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,9 +6,11 @@
66
77
It takes input text from the command line.
88
"""
9+
910
import argparse
1011

1112
from pythainlp.soundex import DEFAULT_SOUNDEX_ENGINE, soundex
13+
from pythainlp.tools import safe_print
1214

1315

1416
class App:
@@ -47,4 +49,5 @@ def __init__(self, argv):
4749
args = parser.parse_args(argv[2:])
4850

4951
sdx = soundex(args.text, engine=args.algorithm)
50-
print(sdx)
52+
53+
safe_print(sdx)

pythainlp/cli/tag.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,10 +4,12 @@
44
"""
55
Command line for PyThaiNLP's taggers.
66
"""
7+
78
import argparse
89

910
from pythainlp import cli
1011
from pythainlp.tag import pos_tag
12+
from pythainlp.tools import safe_print
1113

1214

1315
class SubAppBase:
@@ -34,7 +36,7 @@ def __init__(self, name, argv):
3436
result = self.run(tokens)
3537

3638
for word, tag in result:
37-
print(word, "/", tag)
39+
safe_print(word + " / " + tag)
3840

3941

4042
class POSTaggingApp(SubAppBase):

pythainlp/cli/tokenize.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
subword_tokenize,
1717
word_tokenize,
1818
)
19+
from pythainlp.tools import safe_print
1920

2021
DEFAULT_SENT_TOKEN_SEPARATOR = "@@"
2122
DEFAULT_SUBWORD_TOKEN_SEPARATOR = "/"
@@ -71,7 +72,7 @@ def __init__(self, name, argv):
7172
engine=args.algorithm,
7273
keep_whitespace=args.keep_whitespace,
7374
)
74-
print(args.separator.join(result) + args.separator)
75+
safe_print(args.separator.join(result) + args.separator)
7576

7677

7778
class WordTokenizationApp(SubAppBase):
@@ -144,4 +145,4 @@ def __init__(self, argv):
144145
elif token_type.startswith("se"):
145146
SentenceTokenizationApp("sent", argv)
146147
else:
147-
print(f"Token type not available: {token_type}")
148+
safe_print(f"Token type not available: {token_type}")

tests/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
# Names of module to be tested
1313
test_packages: list[str] = [
1414
"tests.test_ancient",
15-
# "tests.test_cli",
15+
"tests.test_cli",
1616
# "tests.test_corpus",
1717
"tests.test_morpheme",
1818
"tests.test_soundex",

tests/test_cli.py

Lines changed: 27 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,12 @@
44

55
import unittest
66
from argparse import ArgumentError
7-
from types import ModuleType
87

98
from pythainlp import __main__, cli
9+
from pythainlp.cli.data import App as DataApp
10+
from pythainlp.cli.soundex import App as SoundexApp
11+
from pythainlp.cli.tag import App as TagApp
12+
from pythainlp.cli.tokenize import App as TokenizeApp
1013

1114

1215
class CliTestCase(unittest.TestCase):
@@ -26,68 +29,40 @@ def test_cli_main(self):
2629

2730
self.assertIsNone(__main__.main(["thainlp", "data", "path"]))
2831

29-
def test_cli_benchmark(self):
30-
self.assertIsInstance(getattr(cli, "benchmark"), ModuleType)
31-
32-
with self.assertRaises(SystemExit) as ex:
33-
cli.data.App(["thainlp", "benchmark"])
34-
self.assertEqual(ex.exception.code, 2)
35-
36-
self.assertIsNotNone(
37-
cli.benchmark.App(
38-
[
39-
"thainlp",
40-
"benchmark",
41-
"word-tokenization",
42-
"--input-file",
43-
"./tests/data/input.txt",
44-
"--test-file",
45-
"./tests/data/test.txt",
46-
"--save-details",
47-
]
48-
)
49-
)
50-
5132
def test_cli_data(self):
52-
self.assertIsInstance(getattr(cli, "data"), ModuleType)
33+
self.assertTrue(hasattr(cli, "data"))
5334

5435
with self.assertRaises(SystemExit) as ex:
55-
cli.data.App(["thainlp", "data"])
36+
DataApp(["thainlp", "data"])
5637
self.assertEqual(ex.exception.code, 2)
5738

58-
self.assertIsNotNone(cli.data.App(["thainlp", "data", "catalog"]))
59-
self.assertIsNotNone(cli.data.App(["thainlp", "data", "path"]))
60-
self.assertIsNotNone(cli.data.App(["thainlp", "data", "get", "test"]))
61-
self.assertIsNotNone(cli.data.App(["thainlp", "data", "info", "test"]))
62-
self.assertIsNotNone(cli.data.App(["thainlp", "data", "rm", "test"]))
63-
self.assertIsNotNone(
64-
cli.data.App(["thainlp", "data", "get", "NOT_EXIST"])
65-
)
66-
self.assertIsNotNone(
67-
cli.data.App(["thainlp", "data", "info", "NOT_EXIST"])
68-
)
69-
self.assertIsNotNone(
70-
cli.data.App(["thainlp", "data", "rm", "NOT_EXIST"])
71-
)
39+
self.assertIsNotNone(DataApp(["thainlp", "data", "catalog"]))
40+
self.assertIsNotNone(DataApp(["thainlp", "data", "path"]))
41+
self.assertIsNotNone(DataApp(["thainlp", "data", "get", "test"]))
42+
self.assertIsNotNone(DataApp(["thainlp", "data", "info", "test"]))
43+
self.assertIsNotNone(DataApp(["thainlp", "data", "rm", "test"]))
44+
self.assertIsNotNone(DataApp(["thainlp", "data", "get", "NOT_EXIST"]))
45+
self.assertIsNotNone(DataApp(["thainlp", "data", "info", "NOT_EXIST"]))
46+
self.assertIsNotNone(DataApp(["thainlp", "data", "rm", "NOT_EXIST"]))
7247

7348
def test_cli_soundex(self):
74-
self.assertIsInstance(getattr(cli, "soundex"), ModuleType)
49+
self.assertTrue(hasattr(cli, "soundex"))
7550

7651
with self.assertRaises(SystemExit) as ex:
77-
cli.data.App(["thainlp", "soundex"])
52+
DataApp(["thainlp", "soundex"])
7853
self.assertEqual(ex.exception.code, 2)
7954

80-
self.assertIsNotNone(cli.soundex.App(["thainlp", "soundex", "ทดสอบ"]))
55+
self.assertIsNotNone(SoundexApp(["thainlp", "soundex", "ทดสอบ"]))
8156

8257
def test_cli_tag(self):
83-
self.assertIsInstance(getattr(cli, "tag"), ModuleType)
58+
self.assertTrue(hasattr(cli, "tag"))
8459

8560
with self.assertRaises(SystemExit) as ex:
86-
cli.data.App(["thainlp", "tag"])
61+
DataApp(["thainlp", "tag"])
8762
self.assertEqual(ex.exception.code, 2)
8863

8964
self.assertIsNotNone(
90-
cli.tag.App(
65+
TagApp(
9166
[
9267
"thainlp",
9368
"tag",
@@ -99,7 +74,7 @@ def test_cli_tag(self):
9974
)
10075
)
10176
self.assertIsNotNone(
102-
cli.tag.App(
77+
TagApp(
10378
[
10479
"thainlp",
10580
"tag",
@@ -112,17 +87,17 @@ def test_cli_tag(self):
11287
)
11388

11489
def test_cli_tokenize(self):
115-
self.assertIsInstance(getattr(cli, "tokenize"), ModuleType)
90+
self.assertTrue(hasattr(cli, "tokenize"))
11691

11792
with self.assertRaises(SystemExit) as ex:
118-
cli.data.App(["thainlp", "tokenize"])
93+
DataApp(["thainlp", "tokenize"])
11994
self.assertEqual(ex.exception.code, 2)
12095

12196
self.assertIsNotNone(
122-
cli.tokenize.App(["thainlp", "tokenize", "NOT_EXIST", "ไม่มีอยู่ จริง"])
97+
TokenizeApp(["thainlp", "tokenize", "NOT_EXIST", "ไม่มีอยู่ จริง"])
12398
)
12499
self.assertIsNotNone(
125-
cli.tokenize.App(
100+
TokenizeApp(
126101
[
127102
"thainlp",
128103
"tokenize",
@@ -134,7 +109,7 @@ def test_cli_tokenize(self):
134109
)
135110
)
136111
self.assertIsNotNone(
137-
cli.tokenize.App(
112+
TokenizeApp(
138113
[
139114
"thainlp",
140115
"tokenize",
@@ -147,7 +122,7 @@ def test_cli_tokenize(self):
147122
)
148123
)
149124
self.assertIsNotNone(
150-
cli.tokenize.App(
125+
TokenizeApp(
151126
[
152127
"thainlp",
153128
"tokenize",
@@ -161,19 +136,3 @@ def test_cli_tokenize(self):
161136
]
162137
)
163138
)
164-
self.assertIsNotNone(
165-
cli.tokenize.App(
166-
[
167-
"thainlp",
168-
"tokenize",
169-
"sent",
170-
"-s",
171-
"|",
172-
(
173-
"ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้"
174-
"กระสุนสำหรับสมองของคุณวันนี้"
175-
"แต่คุณก็จะลืมมันไปทั้งหมดอีกครั้ง"
176-
),
177-
]
178-
)
179-
)

tests/testx_cli.py

Lines changed: 52 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,52 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import unittest
6+
7+
from pythainlp import __main__, cli
8+
from pythainlp.cli.benchmark import App as BenchmarkApp
9+
from pythainlp.cli.data import App as DataApp
10+
from pythainlp.cli.tokenize import App as TokenizeApp
11+
12+
13+
class CliTestCaseX(unittest.TestCase):
14+
def test_cli_benchmark(self):
15+
self.assertTrue(hasattr(cli, "benchmark"))
16+
17+
with self.assertRaises(SystemExit) as ex:
18+
DataApp(["thainlp", "benchmark"])
19+
self.assertEqual(ex.exception.code, 2)
20+
21+
self.assertIsNotNone(
22+
BenchmarkApp(
23+
[
24+
"thainlp",
25+
"benchmark",
26+
"word-tokenization",
27+
"--input-file",
28+
"./tests/data/input.txt",
29+
"--test-file",
30+
"./tests/data/test.txt",
31+
"--save-details",
32+
]
33+
)
34+
)
35+
36+
def test_cli_tokenize(self):
37+
self.assertIsNotNone(
38+
TokenizeApp(
39+
[
40+
"thainlp",
41+
"tokenize",
42+
"sent",
43+
"-s",
44+
"|",
45+
(
46+
"ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้"
47+
"กระสุนสำหรับสมองของคุณวันนี้"
48+
"แต่คุณก็จะลืมมันไปทั้งหมดอีกครั้ง"
49+
),
50+
]
51+
)
52+
)

0 commit comments

Comments
 (0)