2 changes: 2 additions & 0 deletions .github/workflows/unittest.yml
@@ -72,6 +72,8 @@ jobs:
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
run: pip install -r docker_requirements.txt
- name: Install PyThaiNLP
env:
PYTHONIOENCODING: utf-8
run: pip install .
# If you want to install a safe small set of optional dependencies, use:
# pip install .[compact]
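Setting PYTHONIOENCODING: utf-8 for this step forces Python's stdin/stdout/stderr to UTF-8 regardless of the runner's locale, which mainly matters on Windows runners whose console encoding can default to a legacy code page. A minimal way to confirm the effective encoding on a runner (an illustrative check, not part of this PR):

import sys

# With PYTHONIOENCODING=utf-8 in the environment, both report "utf-8".
print(sys.stdout.encoding)
print(sys.stderr.encoding)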
4 changes: 4 additions & 0 deletions pythainlp/cli/__init__.py
@@ -3,9 +3,13 @@
# SPDX-License-Identifier: Apache-2.0
"""Command line helpers."""

import io
import sys
from argparse import ArgumentParser

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8")

# a command should start with a verb when possible
COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark"])

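Re-wrapping sys.stdout.buffer and sys.stderr.buffer in io.TextIOWrapper(..., encoding="utf-8") makes the CLI emit UTF-8 text even when the console's default code page cannot represent Thai, so later print calls do not raise UnicodeEncodeError. On Python 3.7+ the same effect is available without replacing the stream objects; a minimal sketch of that alternative (not what this PR does):

import sys

# reconfigure() switches the encoding in place, so existing references
# to sys.stdout / sys.stderr keep working.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
    sys.stderr.reconfigure(encoding="utf-8")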
20 changes: 13 additions & 7 deletions pythainlp/cli/benchmark.py
@@ -11,6 +11,7 @@

from pythainlp import cli
from pythainlp.benchmarks import word_tokenization
from pythainlp.tools import safe_print


def _read_file(path):
@@ -81,7 +82,7 @@ def __init__(self, name, argv):
expected
), "Input and test files do not have the same number of samples"

print(
safe_print(
"Benchmarking %s against %s with %d samples in total"
% (args.input_file, args.test_file, len(actual))
)
@@ -121,12 +122,12 @@ def __init__(self, name, argv):
/ statistics["word_level:total_words_in_ref_sample"]
)

print("============== Benchmark Result ==============")
safe_print("============== Benchmark Result ==============")

for c in ["tp", "fn", "tn", "fp", "precision", "recall"]:
c = f"char_level:{c}"
v = statistics[c]
print(f"{c:>40s} {v:.4f}")
safe_print(f"{c:>40s} {v:.4f}")

for c in [
"total_words_in_sample",
@@ -137,20 +138,20 @@
]:
c = f"word_level:{c}"
v = statistics[c]
print(f"{c:>40s} {v:.4f}")
safe_print(f"{c:>40s} {v:.4f}")

if args.save_details:
dir_name = os.path.dirname(args.input_file)
file_name = args.input_file.split("/")[-1].split(".")[0]

res_path = "%s/eval-%s.yml" % (dir_name, file_name)
print("Evaluation result is saved to %s" % res_path)
safe_print("Evaluation result is saved to %s" % res_path)

with open(res_path, "w", encoding="utf-8") as outfile:
yaml.dump(statistics, outfile, default_flow_style=False)

res_path = "%s/eval-details-%s.json" % (dir_name, file_name)
print("Details of comparisons is saved to %s" % res_path)
safe_print("Details of comparisons is saved to %s" % res_path)

with open(res_path, "w", encoding="utf-8") as f:
samples = []
@@ -160,7 +161,12 @@ def __init__(self, name, argv):
del r["actual"]

samples.append(
{"metrics": r, "expected": expected, "actual": actual, "id": i}
{
"metrics": r,
"expected": expected,
"actual": actual,
"id": i,
}
)

details = {"metrics": statistics, "samples": samples}
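safe_print is imported from pythainlp.tools throughout these CLI modules, but its body is not part of this diff. A helper of this kind typically degrades to replacement characters instead of crashing when the console cannot encode the text; the sketch below is a hypothetical illustration under that assumption, not PyThaiNLP's actual implementation:

import sys

def safe_print(text: str) -> None:
    # Hypothetical fallback: if the active console encoding cannot
    # represent the text (e.g. Thai on a cp1252 terminal), print a
    # lossy but non-crashing version instead of raising.
    try:
        print(text)
    except UnicodeEncodeError:
        encoding = sys.stdout.encoding or "ascii"
        print(text.encode(encoding, errors="replace").decode(encoding))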
1 change: 1 addition & 0 deletions pythainlp/cli/data.py
@@ -4,6 +4,7 @@
"""
Command line for PyThaiNLP's dataset/corpus management.
"""

import argparse

from pythainlp import corpus
5 changes: 4 additions & 1 deletion pythainlp/cli/soundex.py
@@ -6,9 +6,11 @@

It takes input text from the command line.
"""

import argparse

from pythainlp.soundex import DEFAULT_SOUNDEX_ENGINE, soundex
from pythainlp.tools import safe_print


class App:
@@ -47,4 +49,5 @@ def __init__(self, argv):
args = parser.parse_args(argv[2:])

sdx = soundex(args.text, engine=args.algorithm)
print(sdx)

safe_print(sdx)
4 changes: 3 additions & 1 deletion pythainlp/cli/tag.py
@@ -4,10 +4,12 @@
"""
Command line for PyThaiNLP's taggers.
"""

import argparse

from pythainlp import cli
from pythainlp.tag import pos_tag
from pythainlp.tools import safe_print


class SubAppBase:
@@ -34,7 +36,7 @@ def __init__(self, name, argv):
result = self.run(tokens)

for word, tag in result:
print(word, "/", tag)
safe_print(word + " / " + tag)


class POSTaggingApp(SubAppBase):
5 changes: 3 additions & 2 deletions pythainlp/cli/tokenize.py
@@ -16,6 +16,7 @@
subword_tokenize,
word_tokenize,
)
from pythainlp.tools import safe_print

DEFAULT_SENT_TOKEN_SEPARATOR = "@@"
DEFAULT_SUBWORD_TOKEN_SEPARATOR = "/"
@@ -71,7 +72,7 @@ def __init__(self, name, argv):
engine=args.algorithm,
keep_whitespace=args.keep_whitespace,
)
print(args.separator.join(result) + args.separator)
safe_print(args.separator.join(result) + args.separator)


class WordTokenizationApp(SubAppBase):
@@ -144,4 +145,4 @@ def __init__(self, argv):
elif token_type.startswith("se"):
SentenceTokenizationApp("sent", argv)
else:
print(f"Token type not available: {token_type}")
safe_print(f"Token type not available: {token_type}")
2 changes: 1 addition & 1 deletion tests/__init__.py
@@ -12,7 +12,7 @@
# Names of module to be tested
test_packages: list[str] = [
"tests.test_ancient",
# "tests.test_cli",
"tests.test_cli",
# "tests.test_corpus",
"tests.test_morpheme",
"tests.test_soundex",
95 changes: 27 additions & 68 deletions tests/test_cli.py
@@ -4,9 +4,12 @@

import unittest
from argparse import ArgumentError
from types import ModuleType

from pythainlp import __main__, cli
from pythainlp.cli.data import App as DataApp
from pythainlp.cli.soundex import App as SoundexApp
from pythainlp.cli.tag import App as TagApp
from pythainlp.cli.tokenize import App as TokenizeApp


class CliTestCase(unittest.TestCase):
@@ -26,68 +29,40 @@ def test_cli_main(self):

self.assertIsNone(__main__.main(["thainlp", "data", "path"]))

def test_cli_benchmark(self):
self.assertIsInstance(getattr(cli, "benchmark"), ModuleType)

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "benchmark"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(
cli.benchmark.App(
[
"thainlp",
"benchmark",
"word-tokenization",
"--input-file",
"./tests/data/input.txt",
"--test-file",
"./tests/data/test.txt",
"--save-details",
]
)
)

def test_cli_data(self):
self.assertIsInstance(getattr(cli, "data"), ModuleType)
self.assertTrue(hasattr(cli, "data"))

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "data"])
DataApp(["thainlp", "data"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(cli.data.App(["thainlp", "data", "catalog"]))
self.assertIsNotNone(cli.data.App(["thainlp", "data", "path"]))
self.assertIsNotNone(cli.data.App(["thainlp", "data", "get", "test"]))
self.assertIsNotNone(cli.data.App(["thainlp", "data", "info", "test"]))
self.assertIsNotNone(cli.data.App(["thainlp", "data", "rm", "test"]))
self.assertIsNotNone(
cli.data.App(["thainlp", "data", "get", "NOT_EXIST"])
)
self.assertIsNotNone(
cli.data.App(["thainlp", "data", "info", "NOT_EXIST"])
)
self.assertIsNotNone(
cli.data.App(["thainlp", "data", "rm", "NOT_EXIST"])
)
self.assertIsNotNone(DataApp(["thainlp", "data", "catalog"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "path"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "get", "test"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "info", "test"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "rm", "test"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "get", "NOT_EXIST"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "info", "NOT_EXIST"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "rm", "NOT_EXIST"]))

def test_cli_soundex(self):
self.assertIsInstance(getattr(cli, "soundex"), ModuleType)
self.assertTrue(hasattr(cli, "soundex"))

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "soundex"])
DataApp(["thainlp", "soundex"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(cli.soundex.App(["thainlp", "soundex", "ทดสอบ"]))
self.assertIsNotNone(SoundexApp(["thainlp", "soundex", "ทดสอบ"]))

def test_cli_tag(self):
self.assertIsInstance(getattr(cli, "tag"), ModuleType)
self.assertTrue(hasattr(cli, "tag"))

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "tag"])
DataApp(["thainlp", "tag"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(
cli.tag.App(
TagApp(
[
"thainlp",
"tag",
@@ -99,7 +74,7 @@ def test_cli_tag(self):
)
)
self.assertIsNotNone(
cli.tag.App(
TagApp(
[
"thainlp",
"tag",
@@ -112,17 +87,17 @@
)

def test_cli_tokenize(self):
self.assertIsInstance(getattr(cli, "tokenize"), ModuleType)
self.assertTrue(hasattr(cli, "tokenize"))

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "tokenize"])
DataApp(["thainlp", "tokenize"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(
cli.tokenize.App(["thainlp", "tokenize", "NOT_EXIST", "ไม่มีอยู่ จริง"])
TokenizeApp(["thainlp", "tokenize", "NOT_EXIST", "ไม่มีอยู่ จริง"])
)
self.assertIsNotNone(
cli.tokenize.App(
TokenizeApp(
[
"thainlp",
"tokenize",
@@ -134,7 +109,7 @@ def test_cli_tokenize(self):
)
)
self.assertIsNotNone(
cli.tokenize.App(
TokenizeApp(
[
"thainlp",
"tokenize",
@@ -147,7 +122,7 @@
)
)
self.assertIsNotNone(
cli.tokenize.App(
TokenizeApp(
[
"thainlp",
"tokenize",
@@ -161,19 +136,3 @@
]
)
)
self.assertIsNotNone(
cli.tokenize.App(
[
"thainlp",
"tokenize",
"sent",
"-s",
"|",
(
"ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้"
"กระสุนสำหรับสมองของคุณวันนี้"
"แต่คุณก็จะลืมมันไปทั้งหมดอีกครั้ง"
),
]
)
)
52 changes: 52 additions & 0 deletions tests/testx_cli.py
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import unittest

from pythainlp import __main__, cli
from pythainlp.cli.benchmark import App as BenchmarkApp
from pythainlp.cli.data import App as DataApp
from pythainlp.cli.tokenize import App as TokenizeApp


class CliTestCaseX(unittest.TestCase):
def test_cli_benchmark(self):
self.assertTrue(hasattr(cli, "benchmark"))

with self.assertRaises(SystemExit) as ex:
DataApp(["thainlp", "benchmark"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(
BenchmarkApp(
[
"thainlp",
"benchmark",
"word-tokenization",
"--input-file",
"./tests/data/input.txt",
"--test-file",
"./tests/data/test.txt",
"--save-details",
]
)
)

def test_cli_tokenize(self):
self.assertIsNotNone(
TokenizeApp(
[
"thainlp",
"tokenize",
"sent",
"-s",
"|",
(
"ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้"
"กระสุนสำหรับสมองของคุณวันนี้"
"แต่คุณก็จะลืมมันไปทั้งหมดอีกครั้ง"
),
]
)
)