diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index ffa8573a0..c889f6906 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -72,6 +72,8 @@ jobs: SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True run: pip install -r docker_requirements.txt - name: Install PyThaiNLP + env: + PYTHONIOENCODING: utf-8 run: pip install . # If you want to install a safe small set of optional dependencies, use: # pip install .[compact] diff --git a/pythainlp/cli/__init__.py b/pythainlp/cli/__init__.py index 4bacaddf4..e7001f5bb 100644 --- a/pythainlp/cli/__init__.py +++ b/pythainlp/cli/__init__.py @@ -3,9 +3,13 @@ # SPDX-License-Identifier: Apache-2.0 """Command line helpers.""" +import io import sys from argparse import ArgumentParser +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8") +sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8") + # a command should start with a verb when possible COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark"]) diff --git a/pythainlp/cli/benchmark.py b/pythainlp/cli/benchmark.py index 721eb3559..2f9eedbd9 100644 --- a/pythainlp/cli/benchmark.py +++ b/pythainlp/cli/benchmark.py @@ -11,6 +11,7 @@ from pythainlp import cli from pythainlp.benchmarks import word_tokenization +from pythainlp.tools import safe_print def _read_file(path): @@ -81,7 +82,7 @@ def __init__(self, name, argv): expected ), "Input and test files do not have the same number of samples" - print( + safe_print( "Benchmarking %s against %s with %d samples in total" % (args.input_file, args.test_file, len(actual)) ) @@ -121,12 +122,12 @@ def __init__(self, name, argv): / statistics["word_level:total_words_in_ref_sample"] ) - print("============== Benchmark Result ==============") + safe_print("============== Benchmark Result ==============") for c in ["tp", "fn", "tn", "fp", "precision", "recall"]: c = f"char_level:{c}" v = statistics[c] - print(f"{c:>40s} {v:.4f}") + safe_print(f"{c:>40s} {v:.4f}") for c in [ "total_words_in_sample", @@ -137,20 +138,20 @@ def __init__(self, name, argv): ]: c = f"word_level:{c}" v = statistics[c] - print(f"{c:>40s} {v:.4f}") + safe_print(f"{c:>40s} {v:.4f}") if args.save_details: dir_name = os.path.dirname(args.input_file) file_name = args.input_file.split("/")[-1].split(".")[0] res_path = "%s/eval-%s.yml" % (dir_name, file_name) - print("Evaluation result is saved to %s" % res_path) + safe_print("Evaluation result is saved to %s" % res_path) with open(res_path, "w", encoding="utf-8") as outfile: yaml.dump(statistics, outfile, default_flow_style=False) res_path = "%s/eval-details-%s.json" % (dir_name, file_name) - print("Details of comparisons is saved to %s" % res_path) + safe_print("Details of comparisons is saved to %s" % res_path) with open(res_path, "w", encoding="utf-8") as f: samples = [] @@ -160,7 +161,12 @@ def __init__(self, name, argv): del r["actual"] samples.append( - {"metrics": r, "expected": expected, "actual": actual, "id": i} + { + "metrics": r, + "expected": expected, + "actual": actual, + "id": i, + } ) details = {"metrics": statistics, "samples": samples} diff --git a/pythainlp/cli/data.py b/pythainlp/cli/data.py index 40bc3175d..7f58b9ced 100644 --- a/pythainlp/cli/data.py +++ b/pythainlp/cli/data.py @@ -4,6 +4,7 @@ """ Command line for PyThaiNLP's dataset/corpus management. """ + import argparse from pythainlp import corpus diff --git a/pythainlp/cli/soundex.py b/pythainlp/cli/soundex.py index 587fd9498..6ef7ed897 100644 --- a/pythainlp/cli/soundex.py +++ b/pythainlp/cli/soundex.py @@ -6,9 +6,11 @@ It takes input text from the command line. """ + import argparse from pythainlp.soundex import DEFAULT_SOUNDEX_ENGINE, soundex +from pythainlp.tools import safe_print class App: @@ -47,4 +49,5 @@ def __init__(self, argv): args = parser.parse_args(argv[2:]) sdx = soundex(args.text, engine=args.algorithm) - print(sdx) + + safe_print(sdx) diff --git a/pythainlp/cli/tag.py b/pythainlp/cli/tag.py index 6cb0b54b3..e9bc8eeef 100644 --- a/pythainlp/cli/tag.py +++ b/pythainlp/cli/tag.py @@ -4,10 +4,12 @@ """ Command line for PyThaiNLP's taggers. """ + import argparse from pythainlp import cli from pythainlp.tag import pos_tag +from pythainlp.tools import safe_print class SubAppBase: @@ -34,7 +36,7 @@ def __init__(self, name, argv): result = self.run(tokens) for word, tag in result: - print(word, "/", tag) + safe_print(word + " / " + tag) class POSTaggingApp(SubAppBase): diff --git a/pythainlp/cli/tokenize.py b/pythainlp/cli/tokenize.py index 2f4199748..2c8f5ffc5 100644 --- a/pythainlp/cli/tokenize.py +++ b/pythainlp/cli/tokenize.py @@ -16,6 +16,7 @@ subword_tokenize, word_tokenize, ) +from pythainlp.tools import safe_print DEFAULT_SENT_TOKEN_SEPARATOR = "@@" DEFAULT_SUBWORD_TOKEN_SEPARATOR = "/" @@ -71,7 +72,7 @@ def __init__(self, name, argv): engine=args.algorithm, keep_whitespace=args.keep_whitespace, ) - print(args.separator.join(result) + args.separator) + safe_print(args.separator.join(result) + args.separator) class WordTokenizationApp(SubAppBase): @@ -144,4 +145,4 @@ def __init__(self, argv): elif token_type.startswith("se"): SentenceTokenizationApp("sent", argv) else: - print(f"Token type not available: {token_type}") + safe_print(f"Token type not available: {token_type}") diff --git a/tests/__init__.py b/tests/__init__.py index b2a5c96be..2bc8a708b 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -12,7 +12,7 @@ # Names of module to be tested test_packages: list[str] = [ "tests.test_ancient", - # "tests.test_cli", + "tests.test_cli", # "tests.test_corpus", "tests.test_morpheme", "tests.test_soundex", diff --git a/tests/test_cli.py b/tests/test_cli.py index b7e3ca3c0..15b4750bb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4,9 +4,12 @@ import unittest from argparse import ArgumentError -from types import ModuleType from pythainlp import __main__, cli +from pythainlp.cli.data import App as DataApp +from pythainlp.cli.soundex import App as SoundexApp +from pythainlp.cli.tag import App as TagApp +from pythainlp.cli.tokenize import App as TokenizeApp class CliTestCase(unittest.TestCase): @@ -26,68 +29,40 @@ def test_cli_main(self): self.assertIsNone(__main__.main(["thainlp", "data", "path"])) - def test_cli_benchmark(self): - self.assertIsInstance(getattr(cli, "benchmark"), ModuleType) - - with self.assertRaises(SystemExit) as ex: - cli.data.App(["thainlp", "benchmark"]) - self.assertEqual(ex.exception.code, 2) - - self.assertIsNotNone( - cli.benchmark.App( - [ - "thainlp", - "benchmark", - "word-tokenization", - "--input-file", - "./tests/data/input.txt", - "--test-file", - "./tests/data/test.txt", - "--save-details", - ] - ) - ) - def test_cli_data(self): - self.assertIsInstance(getattr(cli, "data"), ModuleType) + self.assertTrue(hasattr(cli, "data")) with self.assertRaises(SystemExit) as ex: - cli.data.App(["thainlp", "data"]) + DataApp(["thainlp", "data"]) self.assertEqual(ex.exception.code, 2) - self.assertIsNotNone(cli.data.App(["thainlp", "data", "catalog"])) - self.assertIsNotNone(cli.data.App(["thainlp", "data", "path"])) - self.assertIsNotNone(cli.data.App(["thainlp", "data", "get", "test"])) - self.assertIsNotNone(cli.data.App(["thainlp", "data", "info", "test"])) - self.assertIsNotNone(cli.data.App(["thainlp", "data", "rm", "test"])) - self.assertIsNotNone( - cli.data.App(["thainlp", "data", "get", "NOT_EXIST"]) - ) - self.assertIsNotNone( - cli.data.App(["thainlp", "data", "info", "NOT_EXIST"]) - ) - self.assertIsNotNone( - cli.data.App(["thainlp", "data", "rm", "NOT_EXIST"]) - ) + self.assertIsNotNone(DataApp(["thainlp", "data", "catalog"])) + self.assertIsNotNone(DataApp(["thainlp", "data", "path"])) + self.assertIsNotNone(DataApp(["thainlp", "data", "get", "test"])) + self.assertIsNotNone(DataApp(["thainlp", "data", "info", "test"])) + self.assertIsNotNone(DataApp(["thainlp", "data", "rm", "test"])) + self.assertIsNotNone(DataApp(["thainlp", "data", "get", "NOT_EXIST"])) + self.assertIsNotNone(DataApp(["thainlp", "data", "info", "NOT_EXIST"])) + self.assertIsNotNone(DataApp(["thainlp", "data", "rm", "NOT_EXIST"])) def test_cli_soundex(self): - self.assertIsInstance(getattr(cli, "soundex"), ModuleType) + self.assertTrue(hasattr(cli, "soundex")) with self.assertRaises(SystemExit) as ex: - cli.data.App(["thainlp", "soundex"]) + DataApp(["thainlp", "soundex"]) self.assertEqual(ex.exception.code, 2) - self.assertIsNotNone(cli.soundex.App(["thainlp", "soundex", "ทดสอบ"])) + self.assertIsNotNone(SoundexApp(["thainlp", "soundex", "ทดสอบ"])) def test_cli_tag(self): - self.assertIsInstance(getattr(cli, "tag"), ModuleType) + self.assertTrue(hasattr(cli, "tag")) with self.assertRaises(SystemExit) as ex: - cli.data.App(["thainlp", "tag"]) + DataApp(["thainlp", "tag"]) self.assertEqual(ex.exception.code, 2) self.assertIsNotNone( - cli.tag.App( + TagApp( [ "thainlp", "tag", @@ -99,7 +74,7 @@ def test_cli_tag(self): ) ) self.assertIsNotNone( - cli.tag.App( + TagApp( [ "thainlp", "tag", @@ -112,17 +87,17 @@ def test_cli_tag(self): ) def test_cli_tokenize(self): - self.assertIsInstance(getattr(cli, "tokenize"), ModuleType) + self.assertTrue(hasattr(cli, "tokenize")) with self.assertRaises(SystemExit) as ex: - cli.data.App(["thainlp", "tokenize"]) + DataApp(["thainlp", "tokenize"]) self.assertEqual(ex.exception.code, 2) self.assertIsNotNone( - cli.tokenize.App(["thainlp", "tokenize", "NOT_EXIST", "ไม่มีอยู่ จริง"]) + TokenizeApp(["thainlp", "tokenize", "NOT_EXIST", "ไม่มีอยู่ จริง"]) ) self.assertIsNotNone( - cli.tokenize.App( + TokenizeApp( [ "thainlp", "tokenize", @@ -134,7 +109,7 @@ def test_cli_tokenize(self): ) ) self.assertIsNotNone( - cli.tokenize.App( + TokenizeApp( [ "thainlp", "tokenize", @@ -147,7 +122,7 @@ def test_cli_tokenize(self): ) ) self.assertIsNotNone( - cli.tokenize.App( + TokenizeApp( [ "thainlp", "tokenize", @@ -161,19 +136,3 @@ def test_cli_tokenize(self): ] ) ) - self.assertIsNotNone( - cli.tokenize.App( - [ - "thainlp", - "tokenize", - "sent", - "-s", - "|", - ( - "ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้" - "กระสุนสำหรับสมองของคุณวันนี้" - "แต่คุณก็จะลืมมันไปทั้งหมดอีกครั้ง" - ), - ] - ) - ) diff --git a/tests/testx_cli.py b/tests/testx_cli.py new file mode 100644 index 000000000..ac2c5a9cb --- /dev/null +++ b/tests/testx_cli.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 + +import unittest + +from pythainlp import __main__, cli +from pythainlp.cli.benchmark import App as BenchmarkApp +from pythainlp.cli.data import App as DataApp +from pythainlp.cli.tokenize import App as TokenizeApp + + +class CliTestCaseX(unittest.TestCase): + def test_cli_benchmark(self): + self.assertTrue(hasattr(cli, "benchmark")) + + with self.assertRaises(SystemExit) as ex: + DataApp(["thainlp", "benchmark"]) + self.assertEqual(ex.exception.code, 2) + + self.assertIsNotNone( + BenchmarkApp( + [ + "thainlp", + "benchmark", + "word-tokenization", + "--input-file", + "./tests/data/input.txt", + "--test-file", + "./tests/data/test.txt", + "--save-details", + ] + ) + ) + + def test_cli_tokenize(self): + self.assertIsNotNone( + TokenizeApp( + [ + "thainlp", + "tokenize", + "sent", + "-s", + "|", + ( + "ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้" + "กระสุนสำหรับสมองของคุณวันนี้" + "แต่คุณก็จะลืมมันไปทั้งหมดอีกครั้ง" + ), + ] + ) + )