2 changes: 2 additions & 0 deletions .github/workflows/unittest.yml
@@ -72,6 +72,8 @@ jobs:
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
run: pip install -r docker_requirements.txt
- name: Install PyThaiNLP
env:
PYTHONIOENCODING: utf-8
run: pip install .
# If you want to install a safe small set of optional dependencies, use:
# pip install .[compact]
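Setting PYTHONIOENCODING: utf-8 for this step forces Python's stdin/stdout/stderr to UTF-8 regardless of the runner's locale, which mainly matters on Windows runners whose console encoding can default to a legacy code page. A minimal way to confirm the effective encoding on a runner (an illustrative check, not part of this PR):

import sys

# With PYTHONIOENCODING=utf-8 in the environment, both report "utf-8".
print(sys.stdout.encoding)
print(sys.stderr.encoding)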
4 changes: 4 additions & 0 deletions pythainlp/cli/__init__.py
@@ -3,9 +3,13 @@
# SPDX-License-Identifier: Apache-2.0
"""Command line helpers."""

import io
import sys
from argparse import ArgumentParser

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8")

# a command should start with a verb when possible
COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark"])

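Re-wrapping sys.stdout.buffer and sys.stderr.buffer in io.TextIOWrapper(..., encoding="utf-8") makes the CLI emit UTF-8 text even when the console's default code page cannot represent Thai, so later print calls do not raise UnicodeEncodeError. On Python 3.7+ the same effect is available without replacing the stream objects; a minimal sketch of that alternative (not what this PR does):

import sys

# reconfigure() switches the encoding in place, so existing references
# to sys.stdout / sys.stderr keep working.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
    sys.stderr.reconfigure(encoding="utf-8")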
20 changes: 13 additions & 7 deletions pythainlp/cli/benchmark.py
@@ -11,6 +11,7 @@

from pythainlp import cli
from pythainlp.benchmarks import word_tokenization
from pythainlp.tools import safe_print


def _read_file(path):
@@ -81,7 +82,7 @@ def __init__(self, name, argv):
expected
), "Input and test files do not have the same number of samples"

print(
safe_print(
"Benchmarking %s against %s with %d samples in total"
% (args.input_file, args.test_file, len(actual))
)
@@ -121,12 +122,12 @@ def __init__(self, name, argv):
/ statistics["word_level:total_words_in_ref_sample"]
)

print("============== Benchmark Result ==============")
safe_print("============== Benchmark Result ==============")

for c in ["tp", "fn", "tn", "fp", "precision", "recall"]:
c = f"char_level:{c}"
v = statistics[c]
print(f"{c:>40s} {v:.4f}")
safe_print(f"{c:>40s} {v:.4f}")

for c in [
"total_words_in_sample",
@@ -137,20 +138,20 @@
]:
c = f"word_level:{c}"
v = statistics[c]
print(f"{c:>40s} {v:.4f}")
safe_print(f"{c:>40s} {v:.4f}")

if args.save_details:
dir_name = os.path.dirname(args.input_file)
file_name = args.input_file.split("/")[-1].split(".")[0]

res_path = "%s/eval-%s.yml" % (dir_name, file_name)
print("Evaluation result is saved to %s" % res_path)
safe_print("Evaluation result is saved to %s" % res_path)

with open(res_path, "w", encoding="utf-8") as outfile:
yaml.dump(statistics, outfile, default_flow_style=False)

res_path = "%s/eval-details-%s.json" % (dir_name, file_name)
print("Details of comparisons is saved to %s" % res_path)
safe_print("Details of comparisons is saved to %s" % res_path)

with open(res_path, "w", encoding="utf-8") as f:
samples = []
@@ -160,7 +161,12 @@ def __init__(self, name, argv):
del r["actual"]

samples.append(
{"metrics": r, "expected": expected, "actual": actual, "id": i}
{
"metrics": r,
"expected": expected,
"actual": actual,
"id": i,
}
)

details = {"metrics": statistics, "samples": samples}
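safe_print is imported from pythainlp.tools throughout these CLI modules, but its body is not part of this diff. A helper of this kind typically degrades to replacement characters instead of crashing when the console cannot encode the text; the sketch below is a hypothetical illustration under that assumption, not PyThaiNLP's actual implementation:

import sys

def safe_print(text: str) -> None:
    # Hypothetical fallback: if the active console encoding cannot
    # represent the text (e.g. Thai on a cp1252 terminal), print a
    # lossy but non-crashing version instead of raising.
    try:
        print(text)
    except UnicodeEncodeError:
        encoding = sys.stdout.encoding or "ascii"
        print(text.encode(encoding, errors="replace").decode(encoding))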
1 change: 1 addition & 0 deletions pythainlp/cli/data.py
@@ -4,6 +4,7 @@
"""
Command line for PyThaiNLP's dataset/corpus management.
"""

import argparse

from pythainlp import corpus
5 changes: 4 additions & 1 deletion pythainlp/cli/soundex.py
@@ -6,9 +6,11 @@

It takes input text from the command line.
"""

import argparse

from pythainlp.soundex import DEFAULT_SOUNDEX_ENGINE, soundex
from pythainlp.tools import safe_print


class App:
@@ -47,4 +49,5 @@ def __init__(self, argv):
args = parser.parse_args(argv[2:])

sdx = soundex(args.text, engine=args.algorithm)
print(sdx)

safe_print(sdx)
4 changes: 3 additions & 1 deletion pythainlp/cli/tag.py
@@ -4,10 +4,12 @@
"""
Command line for PyThaiNLP's taggers.
"""

import argparse

from pythainlp import cli
from pythainlp.tag import pos_tag
from pythainlp.tools import safe_print


class SubAppBase:
@@ -34,7 +36,7 @@ def __init__(self, name, argv):
result = self.run(tokens)

for word, tag in result:
print(word, "/", tag)
safe_print(word + " / " + tag)


class POSTaggingApp(SubAppBase):
5 changes: 3 additions & 2 deletions pythainlp/cli/tokenize.py
@@ -16,6 +16,7 @@
subword_tokenize,
word_tokenize,
)
from pythainlp.tools import safe_print

DEFAULT_SENT_TOKEN_SEPARATOR = "@@"
DEFAULT_SUBWORD_TOKEN_SEPARATOR = "/"
@@ -71,7 +72,7 @@ def __init__(self, name, argv):
engine=args.algorithm,
keep_whitespace=args.keep_whitespace,
)
print(args.separator.join(result) + args.separator)
safe_print(args.separator.join(result) + args.separator)


class WordTokenizationApp(SubAppBase):
@@ -144,4 +145,4 @@ def __init__(self, argv):
elif token_type.startswith("se"):
SentenceTokenizationApp("sent", argv)
else:
print(f"Token type not available: {token_type}")
safe_print(f"Token type not available: {token_type}")
2 changes: 1 addition & 1 deletion tests/__init__.py
@@ -12,7 +12,7 @@
# Names of module to be tested
test_packages: list[str] = [
"tests.test_ancient",
# "tests.test_cli",
"tests.test_cli",
# "tests.test_corpus",
"tests.test_morpheme",
"tests.test_soundex",
95 changes: 27 additions & 68 deletions tests/test_cli.py
@@ -4,9 +4,12 @@

import unittest
from argparse import ArgumentError
from types import ModuleType

from pythainlp import __main__, cli
from pythainlp.cli.data import App as DataApp
from pythainlp.cli.soundex import App as SoundexApp
from pythainlp.cli.tag import App as TagApp
from pythainlp.cli.tokenize import App as TokenizeApp


class CliTestCase(unittest.TestCase):
@@ -26,68 +29,40 @@ def test_cli_main(self):

self.assertIsNone(__main__.main(["thainlp", "data", "path"]))

def test_cli_benchmark(self):
self.assertIsInstance(getattr(cli, "benchmark"), ModuleType)

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "benchmark"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(
cli.benchmark.App(
[
"thainlp",
"benchmark",
"word-tokenization",
"--input-file",
"./tests/data/input.txt",
"--test-file",
"./tests/data/test.txt",
"--save-details",
]
)
)

def test_cli_data(self):
self.assertIsInstance(getattr(cli, "data"), ModuleType)
self.assertTrue(hasattr(cli, "data"))

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "data"])
DataApp(["thainlp", "data"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(cli.data.App(["thainlp", "data", "catalog"]))
self.assertIsNotNone(cli.data.App(["thainlp", "data", "path"]))
self.assertIsNotNone(cli.data.App(["thainlp", "data", "get", "test"]))
self.assertIsNotNone(cli.data.App(["thainlp", "data", "info", "test"]))
self.assertIsNotNone(cli.data.App(["thainlp", "data", "rm", "test"]))
self.assertIsNotNone(
cli.data.App(["thainlp", "data", "get", "NOT_EXIST"])
)
self.assertIsNotNone(
cli.data.App(["thainlp", "data", "info", "NOT_EXIST"])
)
self.assertIsNotNone(
cli.data.App(["thainlp", "data", "rm", "NOT_EXIST"])
)
self.assertIsNotNone(DataApp(["thainlp", "data", "catalog"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "path"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "get", "test"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "info", "test"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "rm", "test"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "get", "NOT_EXIST"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "info", "NOT_EXIST"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "rm", "NOT_EXIST"]))

def test_cli_soundex(self):
self.assertIsInstance(getattr(cli, "soundex"), ModuleType)
self.assertTrue(hasattr(cli, "soundex"))

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "soundex"])
DataApp(["thainlp", "soundex"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(cli.soundex.App(["thainlp", "soundex", "ทดสอบ"]))
self.assertIsNotNone(SoundexApp(["thainlp", "soundex", "ทดสอบ"]))

def test_cli_tag(self):
self.assertIsInstance(getattr(cli, "tag"), ModuleType)
self.assertTrue(hasattr(cli, "tag"))

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "tag"])
DataApp(["thainlp", "tag"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(
cli.tag.App(
TagApp(
[
"thainlp",
"tag",
@@ -99,7 +74,7 @@ def test_cli_tag(self):
)
)
self.assertIsNotNone(
cli.tag.App(
TagApp(
[
"thainlp",
"tag",
@@ -112,17 +87,17 @@
)

def test_cli_tokenize(self):
self.assertIsInstance(getattr(cli, "tokenize"), ModuleType)
self.assertTrue(hasattr(cli, "tokenize"))

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "tokenize"])
DataApp(["thainlp", "tokenize"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(
cli.tokenize.App(["thainlp", "tokenize", "NOT_EXIST", "ไม่มีอยู่ จริง"])
TokenizeApp(["thainlp", "tokenize", "NOT_EXIST", "ไม่มีอยู่ จริง"])
)
self.assertIsNotNone(
cli.tokenize.App(
TokenizeApp(
[
"thainlp",
"tokenize",
@@ -134,7 +109,7 @@ def test_cli_tokenize(self):
)
)
self.assertIsNotNone(
cli.tokenize.App(
TokenizeApp(
[
"thainlp",
"tokenize",
@@ -147,7 +122,7 @@
)
)
self.assertIsNotNone(
cli.tokenize.App(
TokenizeApp(
[
"thainlp",
"tokenize",
@@ -161,19 +136,3 @@
]
)
)
self.assertIsNotNone(
cli.tokenize.App(
[
"thainlp",
"tokenize",
"sent",
"-s",
"|",
(
"ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้"
"กระสุนสำหรับสมองของคุณวันนี้"
"แต่คุณก็จะลืมมันไปทั้งหมดอีกครั้ง"
),
]
)
)
52 changes: 52 additions & 0 deletions tests/testx_cli.py
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import unittest

from pythainlp import __main__, cli
from pythainlp.cli.benchmark import App as BenchmarkApp
from pythainlp.cli.data import App as DataApp
from pythainlp.cli.tokenize import App as TokenizeApp


class CliTestCaseX(unittest.TestCase):
def test_cli_benchmark(self):
self.assertTrue(hasattr(cli, "benchmark"))

with self.assertRaises(SystemExit) as ex:
DataApp(["thainlp", "benchmark"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(
BenchmarkApp(
[
"thainlp",
"benchmark",
"word-tokenization",
"--input-file",
"./tests/data/input.txt",
"--test-file",
"./tests/data/test.txt",
"--save-details",
]
)
)

def test_cli_tokenize(self):
self.assertIsNotNone(
TokenizeApp(
[
"thainlp",
"tokenize",
"sent",
"-s",
"|",
(
"ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้"
"กระสุนสำหรับสมองของคุณวันนี้"
"แต่คุณก็จะลืมมันไปทั้งหมดอีกครั้ง"
),
]
)
)