diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy-docs.yml similarity index 96% rename from .github/workflows/deploy_docs.yml rename to .github/workflows/deploy-docs.yml index 1e7a157e1..8884a1b6c 100644 --- a/.github/workflows/deploy_docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -1,4 +1,4 @@ -name: Deploy Docs dev +name: Deploy development documentation on: push: branches: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 82fbd0f77..e50bae84b 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -17,13 +17,13 @@ on: - 'docs/**' jobs: - ruff: - runs-on: ubuntu-24.04 - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Ruff - uses: astral-sh/ruff-action@v1 - with: - src: "./pythainlp" - args: check --verbose --line-length 79 --select C901 + ruff: + runs-on: ubuntu-24.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Ruff + uses: astral-sh/ruff-action@v1 + with: + src: "./pythainlp" + args: check --verbose --line-length 79 --select C901 diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index a5db5d20f..adaccf97e 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -13,7 +13,8 @@ jobs: python-version: [3.8] steps: - - uses: actions/checkout@v4 + - name: Checkout + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: @@ -21,7 +22,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel twine + python -m pip install setuptools twine wheel python setup.py sdist bdist_wheel - name: Publish a Python distribution to PyPI uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml index e64011920..90d579dec 100644 --- a/.github/workflows/pypi-test.yml +++ b/.github/workflows/pypi-test.yml @@ -23,9 +23,9 @@ jobs: 
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True run: | python -m pip install --upgrade pip - pip install deepcut tltk - SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r https://raw.githubusercontent.com/PyThaiNLP/pythainlp/dev/docker_requirements.txt - pip install pythainlp[full] + python -m pip install deepcut tltk + SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True python -m pip install -r https://raw.githubusercontent.com/PyThaiNLP/pythainlp/dev/docker_requirements.txt + python -m pip install pythainlp[full] python -m nltk.downloader omw-1.4 - name: Test run: | diff --git a/.github/workflows/macos-test.yml b/.github/workflows/unittest-macos.yml similarity index 91% rename from .github/workflows/macos-test.yml rename to .github/workflows/unittest-macos.yml index a7516236f..30a2047ca 100644 --- a/.github/workflows/macos-test.yml +++ b/.github/workflows/unittest-macos.yml @@ -1,4 +1,4 @@ -name: macOS Unit test and code coverage +name: Unit test and code coverage (MacOS) on: push: @@ -25,8 +25,10 @@ jobs: python-version: [3.8] steps: - - uses: actions/checkout@v4 - - uses: conda-incubator/setup-miniconda@v3 + - name: Checkout + uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: conda-incubator/setup-miniconda@v3 with: python-version: ${{ matrix.python-version }} auto-activate-base: false @@ -64,7 +66,8 @@ jobs: python -m pip cache purge python -m unittest discover if: matrix.os == 'self-hosted' - - shell: bash -l {0} + - name: Show environment + shell: bash -l {0} run: | conda info conda list diff --git a/.github/workflows/windows-test.yml b/.github/workflows/unittest-windows.yml similarity index 86% rename from .github/workflows/windows-test.yml rename to .github/workflows/unittest-windows.yml index 3394f7d40..5f9a71ceb 100644 --- a/.github/workflows/windows-test.yml +++ b/.github/workflows/unittest-windows.yml @@ -1,4 +1,4 @@ -name: Windows Unit test and code coverage +name: Unit test and code 
coverage (Windows) on: push: @@ -25,13 +25,16 @@ python-version: [3.8] steps: - - uses: actions/checkout@v4 - - uses: conda-incubator/setup-miniconda@v3 + - name: Checkout + uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: conda-incubator/setup-miniconda@v3 with: python-version: ${{ matrix.python-version }} auto-activate-base: true auto-update-conda: true - - shell: powershell + - name: Show environment + shell: powershell run: | conda info conda list diff --git a/.github/workflows/test.yml b/.github/workflows/unittest-ubuntu.yml similarity index 92% rename from .github/workflows/test.yml rename to .github/workflows/unittest-ubuntu.yml index a62ff6419..3a94a4e62 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/unittest-ubuntu.yml @@ -1,4 +1,4 @@ -name: Unit test and code coverage +name: Unit test and code coverage (Ubuntu) on: push: @@ -21,7 +21,8 @@ jobs: python-version: [3.9] steps: - - uses: actions/checkout@v4 + - name: Checkout + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: diff --git a/.pep8speaks.yml b/.pep8speaks.yml index 4c9a24fd1..5e5e5cf5c 100644 --- a/.pep8speaks.yml +++ b/.pep8speaks.yml @@ -1,11 +1,11 @@ scanner: - diff_only: True # If False, the entire file touched by the Pull Request is scanned for errors. If True, only the diff is scanned. - linter: pycodestyle # Other option is flake8 + diff_only: True # If False, the entire file touched by the Pull Request is scanned for errors. If True, only the diff is scanned. + linter: pycodestyle # Other option is flake8 pycodestyle: # Same as scanner.linter value. 
Other option is flake8 - max-line-length: 100 # Default is 79 in PEP 8 - ignore: # Errors and warnings to ignore - - W504 # line break after binary operator - - E402 # module level import not at top of file - - E731 # do not assign a lambda expression, use a def + max-line-length: 100 # Default is 79 in PEP 8 + ignore: # Errors and warnings to ignore + - W504 # line break after binary operator + - E402 # module level import not at top of file + - E731 # do not assign a lambda expression, use a def no_blank_comment: True # If True, no comment is made on PR without any errors. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 777292a07..c9fa48b5c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -6,10 +6,9 @@ Please refer to our [Contributor Covenant Code of Conduct](https://github.com/Py ## Issue Report and Discussion -- Discussion: https://github.com/PyThaiNLP/pythainlp/discussions -- GitHub issues (for problems and suggestions): https://github.com/PyThaiNLP/pythainlp/issues -- Facebook group (not specific to PyThaiNLP, for Thai NLP discussion in general): https://www.facebook.com/groups/thainlp - +- Discussion: +- GitHub issues (for problems and suggestions): +- Facebook group (not specific to PyThaiNLP, for Thai NLP discussion in general): ## Code @@ -49,7 +48,6 @@ to manage our branches. and several checks automatically. Click the "Details" link at the end of each check to see what needs to be fixed. - ## Documentation - We use [Sphinx](https://www.sphinx-doc.org/en/master/) to generate API document @@ -57,8 +55,7 @@ automatically from "docstring" comments in source codes. This means the comment section in the source codes is important for the quality of documentation. - A docstring should start with one summary line, end with one line with a full stop (period), then be followed by a blank line before starting a new paragraph. -- A commit to release branches (e.g. 
`2.2`, `2.1`) with a title **"(build and deploy docs)"** (without quotes) will trigger the system to rebuild the documentation files and upload them to the website https://pythainlp.org/docs. - +- A commit to release branches (e.g. `2.2`, `2.1`) with a title **"(build and deploy docs)"** (without quotes) will trigger the system to rebuild the documentation files and upload them to the website . ## Testing @@ -67,29 +64,33 @@ We use standard Python `unittest`. The test suite is in `tests/` directory. To run unit tests locally together with code coverage test: (from main `pythainlp/` directory) + ```sh coverage run -m unittest discover ``` See code coverage test: + ```sh coverage report ``` Generate code coverage test in HTML (files will be available in `htmlcov/` directory): + ```sh coverage html ``` Make sure the tests pass on both Github Actions and AppVeyor. - ## Releasing + - We use [semantic versioning](https://semver.org/): MAJOR.MINOR.PATCH, with development build suffix: MAJOR.MINOR.PATCH-devBUILD - We use [`bumpversion`](https://github.com/c4urself/bump2version/#installation) to manage versioning. - `bumpversion [major|minor|patch|release|build]` - Example: - ``` + + ```sh #current_version = 2.3.3-dev0 bumpversion build @@ -132,10 +133,11 @@ Make sure the tests pass on both Github Actions and AppVeyor. Thanks to all [contributors](https://github.com/PyThaiNLP/pythainlp/graphs/contributors). 
(Image made with [contributors-img](https://contributors-img.firebaseapp.com)) ### Development Leads + - Wannaphong Phatthiyaphaibun - foundation, distribution and maintenance - Korakot Chaovavanich - initial tokenization and soundex codes - Charin Polpanumas - classification and benchmarking -- Arthit Suriyawongkul - refactoring, packaging, distribution, and maintenance +- Arthit Suriyawongkul - documentation, refactoring, packaging, distribution, and maintenance - Lalita Lowphansirikul - documentation - Pattarawat Chormai - benchmarking - Peerat Limkonchotiwat @@ -143,18 +145,19 @@ Thanks to all [contributors](https://github.com/PyThaiNLP/pythainlp/graphs/contr - Can Udomcharoenchaikit - documentation and codes ### Maintainers + - Arthit Suriyawongkul - Wannaphong Phatthiyaphaibun - ### Past + - Peeradej Tanruangporn - documentation ## References -- **[Maximum Matching]** -- Manabu Sassano. Deterministic Word Segmentation Using Maximum Matching with Fully Lexicalized Rules. Retrieved from http://www.aclweb.org/anthology/E14-4016 -- **[MetaSound]** -- Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical Ontology for Analysing Names Given in Accordance with Thai Astrology. Retrieved from https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf +- **[Maximum Matching]** -- Manabu Sassano. Deterministic Word Segmentation Using Maximum Matching with Fully Lexicalized Rules. Retrieved from +- **[MetaSound]** -- Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical Ontology for Analysing Names Given in Accordance with Thai Astrology. Retrieved from - **[Thai Character Cluster]** -- T. Teeramunkong, V. Sornlertlamvanich, T. Tanhermhong and W. Chinnan, “Character cluster based Thai information retrieval,” in IRAL '00 Proceedings of the fifth international workshop on on Information retrieval with Asian languages, 2000. 
- **[Enhanced Thai Character Cluster]** -- Jeeragone Inrut, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and Para Limmaneepraserth. “Thai word segmentation using combination of forward and backward longest matching techniques.” In International Symposium on Communications and Information Technology (ISCIT), pp. 37-40. 2001. - เพ็ญศิริ ลี้ตระกูล. การเลือกประโยคสำคัญในการสรุปความภาษาไทย โดยใช้แบบจำลองแบบลำดับชั้น (Selection of Important Sentences in Thai Text Summarization Using a Hierarchical Model). Retrieved from [http://digi.library.tu.ac.th/thesis/st/0192/](https://digital.library.tu.ac.th/tu_dc/frontend/Info/item/dc:124897) -- **[Thai Discourse Treebank]** -- Ponrawee Prasertsom, Apiwat Jaroonpol, Attapol T. Rutherford; The Thai Discourse Treebank: Annotating and Classifying Thai Discourse Connectives. Transactions of the Association for Computational Linguistics 2024; 12 613–629. doi: https://doi.org/10.1162/tacl_a_00650 +- **[Thai Discourse Treebank]** -- Ponrawee Prasertsom, Apiwat Jaroonpol, Attapol T. Rutherford; The Thai Discourse Treebank: Annotating and Classifying Thai Discourse Connectives. Transactions of the Association for Computational Linguistics 2024; 12 613–629. 
doi: diff --git a/pythainlp/soundex/__init__.py b/pythainlp/soundex/__init__.py index c86b855eb..3799795da 100644 --- a/pythainlp/soundex/__init__.py +++ b/pythainlp/soundex/__init__.py @@ -8,17 +8,17 @@ """ __all__ = [ - "soundex", "lk82", "metasound", - "udom83", "prayut_and_somchaip", + "soundex", + "udom83", ] from pythainlp.soundex.lk82 import lk82 from pythainlp.soundex.metasound import metasound -from pythainlp.soundex.udom83 import udom83 from pythainlp.soundex.prayut_and_somchaip import prayut_and_somchaip +from pythainlp.soundex.udom83 import udom83 DEFAULT_SOUNDEX_ENGINE = "udom83" diff --git a/pythainlp/summarize/__init__.py b/pythainlp/summarize/__init__.py index 056c81c46..b47b276c8 100644 --- a/pythainlp/summarize/__init__.py +++ b/pythainlp/summarize/__init__.py @@ -11,4 +11,4 @@ CPE_KMUTT_THAI_SENTENCE_SUM = "mt5-cpe-kmutt-thai-sentence-sum" DEFAULT_KEYWORD_EXTRACTION_ENGINE = "keybert" -from pythainlp.summarize.core import summarize, extract_keywords +from pythainlp.summarize.core import extract_keywords, summarize diff --git a/pythainlp/summarize/core.py b/pythainlp/summarize/core.py index c3da14e4b..4d33073ae 100644 --- a/pythainlp/summarize/core.py +++ b/pythainlp/summarize/core.py @@ -5,12 +5,12 @@ Text summarization and keyword extraction """ -from typing import List, Iterable, Optional, Tuple +from typing import Iterable, List, Optional, Tuple from pythainlp.summarize import ( - DEFAULT_SUMMARIZE_ENGINE, CPE_KMUTT_THAI_SENTENCE_SUM, DEFAULT_KEYWORD_EXTRACTION_ENGINE, + DEFAULT_SUMMARIZE_ENGINE, ) from pythainlp.summarize.freq import FrequencySummarizer from pythainlp.tokenize import sent_tokenize @@ -198,8 +198,8 @@ def rank_by_frequency( tokenizer: str = "newmm", stop_words: Optional[Iterable[str]] = None, ): - from pythainlp.util.keywords import rank from pythainlp.tokenize import word_tokenize + from pythainlp.util.keywords import rank tokens = word_tokenize(text, engine=tokenizer, keep_whitespace=False) diff --git 
a/pythainlp/summarize/mt5.py b/pythainlp/summarize/mt5.py index de1e6ecbf..e27e8f63a 100644 --- a/pythainlp/summarize/mt5.py +++ b/pythainlp/summarize/mt5.py @@ -5,7 +5,8 @@ Summarization by mT5 model """ from typing import List -from transformers import T5Tokenizer, MT5ForConditionalGeneration + +from transformers import MT5ForConditionalGeneration, T5Tokenizer from pythainlp.summarize import CPE_KMUTT_THAI_SENTENCE_SUM