diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
index 0132862f9..de7414c14 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -4,6 +4,7 @@ about: Propose a change or an addition เสนอความสามาร
 ---
 
 ## Detailed description
+
 
 ## Context
 
@@ -14,7 +15,8 @@ about: Propose a change or an addition เสนอความสามาร
 
 ## Your environment
 
-* PyThaiNLP version:
-* Python version:
-* Operating system and version (distro, 32/64-bit):
-* More info (Docker, VM, etc.):
+
+- PyThaiNLP version:
+- Python version:
+- Operating system and version (distro, 32/64-bit):
+- More info (Docker, VM, etc.):
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 61435ef5e..63f149aac 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -13,6 +13,7 @@ Description of how the changes fix the issue.
 Fixes #...
 
 ### Your checklist for this pull request
+
 🚨Please review the [guidelines for contributing](https://github.com/PyThaiNLP/pythainlp/blob/dev/CONTRIBUTING.md) to this repository.
 
 - [ ] Passed code styles and structures
diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml
index 72e1eb6ef..0baee703e 100644
--- a/.github/workflows/deploy_docs.yml
+++ b/.github/workflows/deploy_docs.yml
@@ -10,21 +10,21 @@ on:
 jobs:
   release:
     name: Build
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     steps:
-      - uses: actions/checkout@v4
-      - name: Set up Python 3.8
-        uses: actions/setup-python@v4
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
         with:
-          python-version: '3.8'
+          python-version: "3.10"
       - name: Install dependencies
         env:
           SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
         run: |
-          python -m pip install --upgrade pip
+          python -m pip install --upgrade "pip==24.0" "setuptools"
          pip install pytest coverage coveralls
           if [ -f docker_requirements.txt ]; then pip install -r docker_requirements.txt; fi
-          pip install deepcut
           pip install .[full]
           pip install boto smart_open sphinx sphinx-rtd-theme
           python -m nltk.downloader omw-1.4
diff --git a/.github/workflows/macos-test.yml b/.github/workflows/macos-test.yml
deleted file mode 100644
index 64b17524e..000000000
--- a/.github/workflows/macos-test.yml
+++ /dev/null
@@ -1,98 +0,0 @@
-name: macOS Unit test and code coverage
-
-on:
-  push:
-    paths-ignore:
-      - '**.md'
-      - 'docs/**'
-#      - '**.yml'
-  pull_request:
-    branches:
-      - dev
-    paths-ignore:
-      - '**.md'
-      - '**.yml'
-      - 'docs/**'
-
-jobs:
-  build:
-
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [macos-latest, self-hosted]
-        python-version: [3.8]
-
-    steps:
-      - uses: actions/checkout@v4
-      - uses: conda-incubator/setup-miniconda@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-          auto-activate-base: false
-          auto-update-conda: true
-        if: matrix.os == 'macos-latest'
-#      - name: Install mac m1
-#        run: |
-#          mkdir -p ~/miniconda3
-#          wget https://repo.anaconda.com/miniconda/Miniconda3-py38_4.12.0-MacOSX-arm64.sh
-#          chmod +x Miniconda3-py38_4.12.0-MacOSX-arm64.sh
-#          bash Miniconda3-py38_4.12.0-MacOSX-arm64.sh -b -u -p ~/miniconda3
-#          ~/miniconda3/bin/conda init bash
-#          ~/miniconda3/bin/conda init zsh
-#        if: matrix.os == 'self-hosted'
-      - name: Test PyThaiNLP - M1
-        shell: bash -l {0}
-        run: |
-          source ~/miniconda3/etc/profile.d/conda.sh
-          conda create -y -n pythainlpwork38 python=3.8
-          conda activate pythainlpwork38
-          conda info
-          conda list
-          python -m pip install --upgrade pip
-          SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r docker_requirements.txt
-          conda install -y -c conda-forge protobuf
-          pip install pytest coverage coveralls typing_extensions==4.5.0
-          pip install ssg epitran
-          pip install fastai==1.0.61
-          pip install fairseq==0.10.2
-          conda install -y -c conda-forge icu
-          conda install -y -c conda-forge pyicu
-          pip install deepcut tltk
-          pip install .[full]
-          python -m nltk.downloader omw-1.4
-          python -m pip cache purge
-          python -m unittest discover
-        if: matrix.os == 'self-hosted'
-      - shell: bash -l {0}
-        run: |
-          conda info
-          conda list
-        if: matrix.os == 'self-hosted'
-      - name: Install PyTorch
-        shell: bash -l {0}
-        run: |
-          pip install torch==1.10.0
-        if: matrix.os != 'self-hosted'
-      - name: Install dependencies
-        shell: bash -l {0}
-        run: |
-          python -m pip install --upgrade pip
-          pip install pytest coverage coveralls
-          conda install -c conda-forge icu
-          conda install -c conda-forge pyicu
-          SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r docker_requirements.txt
-          pip install deepcut tltk
-          pip install .[full]
-          python -m nltk.downloader omw-1.4
-          python -m pip cache purge
-        if: matrix.os != 'self-hosted'
-      - name: Test
-        shell: bash -l {0}
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          COVERALLS_SERVICE_NAME: github
-        run: |
-          coverage run -m unittest discover
-          coveralls
-        if: matrix.os != 'self-hosted'
diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml
index 387d8ad63..d3ec7d878 100644
--- a/.github/workflows/pypi-publish.yml
+++ b/.github/workflows/pypi-publish.yml
@@ -7,21 +7,22 @@ on:
 jobs:
   deploy:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     strategy:
       matrix:
-        python-version: [3.8]
+        python-version: ["3.10"]
 
     steps:
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
-          python -m pip install --upgrade pip
-          pip install setuptools wheel twine
+          python -m pip install --upgrade "pip==24.0" "setuptools"
+          pip install wheel twine
          python setup.py sdist bdist_wheel
 
      - name: Publish a Python distribution to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml
index 23181bf5f..a10773537 100644
--- a/.github/workflows/pypi-test.yml
+++ b/.github/workflows/pypi-test.yml
@@ -7,23 +7,23 @@ on:
 jobs:
   build:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     strategy:
       matrix:
-        python-version: [3.8]
+        python-version: ["3.10"]
 
     steps:
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        env:
          SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
        run: |
-          python -m pip install --upgrade pip
-          pip install deepcut tltk
+          python -m pip install --upgrade "pip==24.0" "setuptools"
          SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r https://raw.githubusercontent.com/PyThaiNLP/pythainlp/dev/docker_requirements.txt
          pip install pythainlp[full]
          python -m nltk.downloader omw-1.4
diff --git a/.github/workflows/test-macos.yml b/.github/workflows/test-macos.yml
new file mode 100644
index 000000000..647b21846
--- /dev/null
+++ b/.github/workflows/test-macos.yml
@@ -0,0 +1,62 @@
+name: Unit test and code coverage (macOS)
+
+on:
+  push:
+    paths-ignore:
+      - '**.md'
+      - 'docs/**'
+#      - '**.yml'
+  pull_request:
+    branches:
+      - dev
+    paths-ignore:
+      - '**.md'
+      - '**.yml'
+      - 'docs/**'
+
+jobs:
+  build:
+
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [macos-latest]
+        python-version: ["3.10"]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install the ICU library
+        run: |
+          brew install icu4c
+          PKG_CONFIG_PATH=$(brew --prefix)/opt/icu4c/lib/pkgconfig
+          echo "PKG_CONFIG_PATH=${PKG_CONFIG_PATH}" >> "${GITHUB_ENV}"
+      - name: Determine the ICU version
+        run: |
+          ICU_VER=$(pkg-config --modversion icu-i18n)
+          echo "ICU_VER=${ICU_VER}"
+          echo "ICU_VER=${ICU_VER}" >> "${GITHUB_ENV}"
+      - name: Install dependencies
+        shell: bash -l {0}
+        run: |
+          python -m pip install --upgrade "pip==24.0" "setuptools"
+          python -m pip --version
+          python -m pip show setuptools
+          python -m pip install pytest coverage coveralls
+          SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True python -m pip install -r docker_requirements.txt
+          python -m pip install .[full]
+          python -m nltk.downloader omw-1.4
+          python -m pip cache purge
+      - name: Test
+        shell: bash -l {0}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          COVERALLS_SERVICE_NAME: github
+        run: |
+          coverage run -m unittest discover
+          coveralls
diff --git a/.github/workflows/test.yml b/.github/workflows/test-ubuntu.yml
similarity index 53%
rename from .github/workflows/test.yml
rename to .github/workflows/test-ubuntu.yml
index 352495b29..01042853a 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test-ubuntu.yml
@@ -1,4 +1,4 @@
-name: Unit test and code coverage
+name: Unit test and code coverage (Ubuntu)
 
 on:
   push:
@@ -15,27 +15,28 @@ on:
 jobs:
   build:
 
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     strategy:
       matrix:
-        python-version: [3.9]
+        python-version: ["3.10"]
 
     steps:
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
-          python -m pip install --upgrade pip
-          python -m pip install backports.zoneinfo[tzdata]
-          pip install pytest coverage coveralls
-          SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r docker_requirements.txt
-          pip install deepcut tltk
-          pip install .[full]
+          python -m pip install --upgrade "pip==24.0" "setuptools"
+          python -m pip install "numpy==2.1.2"
+          python -m pip --version
+          python -m pip show setuptools
+          python -m pip install pytest coverage coveralls
+          SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True python -m pip install -r docker_requirements.txt
+          python -m pip install .[full]
          python -m nltk.downloader omw-1.4
-          python -m pip install spacy deepcut tltk
          python -m pip cache purge
      - name: Test
        env:
diff --git a/.github/workflows/windows-test.yml b/.github/workflows/test-windows.yml
similarity index 60%
rename from .github/workflows/windows-test.yml
rename to .github/workflows/test-windows.yml
index 4883aa22c..420cd191a 100644
--- a/.github/workflows/windows-test.yml
+++ b/.github/workflows/test-windows.yml
@@ -1,4 +1,4 @@
-name: Windows Unit test and code coverage
+name: Unit test and code coverage (Windows)
 
 on:
   push:
@@ -22,38 +22,28 @@ jobs:
       fail-fast: false
       matrix:
         os: [windows-latest]
-        python-version: [3.8]
+        python-version: ["3.10"]
 
     steps:
-      - uses: actions/checkout@v4
-      - uses: conda-incubator/setup-miniconda@v2
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
-          auto-activate-base: true
-          auto-update-conda: true
-      - shell: powershell
-        run: |
-          conda info
-          conda list
-      - name: Install PyTorch
-        shell: powershell
-        run: |
-          pip install torch==1.8.1
       - name: Install dependencies
         shell: powershell
         env:
           SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
         run: |
-          python -m pip install --disable-pip-version-check --user --upgrade pip setuptools
-          python -m pip install backports.zoneinfo[tzdata]
+          python -m pip install --disable-pip-version-check --user --upgrade "pip==24.0" "setuptools"
           python -m pip --version
+          python -m pip show setuptools
           python -m pip install pytest coverage coveralls
-          conda install -y -c conda-forge fairseq
-          python -m pip install https://www.dropbox.com/s/o6p2sj5z50iim1e/PyICU-2.3.1-cp38-cp38-win_amd64.whl?dl=1
+          python -m pip install "https://github.com/cgohlke/pyicu-build/releases/download/v2.13/PyICU-2.13-cp310-cp310-win_amd64.whl"
           python -m pip install -r docker_requirements.txt
           python -m pip install .[full]
           python -m nltk.downloader omw-1.4
-          python -m pip install spacy deepcut tltk
       - name: Test
         shell: powershell
         env:
diff --git a/.gitignore b/.gitignore
index af16c6179..a7f5543b2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -119,3 +119,5 @@ notebooks/iso_11940-dev.ipynb
 # vscode devcontainer
 .devcontainer/
 notebooks/d.model
+
+logs/
diff --git a/Dockerfile b/Dockerfile
index dc8162af7..9e26fd6f0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,12 @@
 # SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
 # SPDX-License-Identifier: Apache-2.0
 
-FROM python:3.8-slim-buster
+FROM python:3.9-slim
 
 COPY . .
 
 RUN apt-get update && apt-get install -y --no-install-recommends build-essential libicu-dev libicu63 pkg-config && rm -rf /var/lib/apt/lists/*
 
-RUN pip3 install --upgrade pip setuptools
+RUN pip3 install --upgrade "pip==24.0" "setuptools"
 RUN if [ -f docker_requirements.txt ]; then pip3 install -r docker_requirements.txt; fi
 RUN pip3 install -e .[full] && pip3 cache purge
diff --git a/docker_requirements.txt b/docker_requirements.txt
index fc6b7947b..84ee6b57b 100644
--- a/docker_requirements.txt
+++ b/docker_requirements.txt
@@ -1,41 +1,39 @@
-PyYAML==5.4.1
+PyYAML==6.0.2
 attacut==1.0.6
-bpemb==0.3.4
-deepcut==0.7.0.0
+bpemb==0.3.6
 emoji==0.5.4
-epitran==1.9
-esupar==1.3.9
-fairseq==0.10.2
+epitran==1.25.1
+esupar==1.7.5
+fairseq==0.12.2
 fastai==1.0.61
 fastcoref==2.1.6
-gensim==4.3.2
-h5py==3.1.0
-khanaa==0.0.6
+gensim==4.3.2 # https://github.com/piskvorky/gensim/issues/3560
+h5py==3.12.1
+khanaa==0.1.1
 nlpo3==1.3.0
-nltk==3.6.6
-numpy==1.22.*
-OSKut==1.3
-pandas==1.4.*
-panphon==0.20.0
+nltk==3.9.1
+numpy==2.1.2
+pandas==1.5.3
+panphon==0.21.2
 phunspell==0.1.6
 protobuf==3.20.3
-pyicu==2.8
-python-crfsuite==0.9.9
-requests==2.31.*
-sacremoses==0.0.41
-sefr_cut==1.1
-sentence-transformers==2.2.2
-sentencepiece==0.1.99
-spacy_thai==0.7.1
-spacy==3.5.*
-spylls==0.1.5
+pyicu==2.13
+python-crfsuite==0.9.11
+requests==2.32.3
+sacremoses==0.1.1
+scikit-learn==1.5.2
+sentence-transformers==3.2.1
+sentencepiece==0.2.0
+spacy_thai==0.7.7
+spacy==3.8.2
+spylls==0.1.7
 ssg==0.0.8
-symspellpy==6.7.7
-tensorflow==2.13.1
+symspellpy==6.7.8
+#tensorflow==2.18.0
 thai-nner==0.3
-tltk==1.6.8
+tltk==1.9.1
 torch==1.13.1
-transformers==4.38.0
-ufal.chu-liu-edmonds==1.0.2
-wtpsplit==1.0.1
+transformers==4.46.0
+ufal.chu-liu-edmonds==1.0.3
+wtpsplit==1.3.0
 wunsen==0.0.3
diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
index 1f42ab128..10fc8b2be 100644
--- a/docs/api/tokenize.rst
+++ b/docs/api/tokenize.rst
@@ -80,13 +80,6 @@ Word level
 A tokenizer designed for word-level segmentation. It provides accurate word boundary detection in Thai text.
 
-**deepcut**
-
-.. automodule:: pythainlp.tokenize.deepcut
-    :members:
-
-    Utilizes deep learning techniques for word segmentation, achieving high accuracy and performance.
-
 **multi_cut**
 
 .. automodule:: pythainlp.tokenize.multi_cut
     :members:
@@ -122,20 +115,6 @@ Word level
 A tokenizer optimized for Named Entity Recognition (NER) tasks, ensuring accurate tokenization for entity recognition.
 
-**sefr_cut**
-
-.. automodule:: pythainlp.tokenize.sefr_cut
-    :members:
-
-    An advanced word tokenizer for segmenting Thai text, with a focus on precision.
-
-**oskut**
-
-.. automodule:: pythainlp.tokenize.oskut
-    :members:
-
-    A tokenizer that uses a pre-trained model for word segmentation. It's a reliable choice for general-purpose text analysis.
-
 **newmm (Default)**
 
 .. automodule:: pythainlp.tokenize.newmm
 
diff --git a/docs/notes/installation.rst b/docs/notes/installation.rst
index a22164a12..ddcbb90de 100644
--- a/docs/notes/installation.rst
+++ b/docs/notes/installation.rst
@@ -29,7 +29,6 @@ where ``extras`` can be
   - ``spell`` (to support phunspell & symspellpy)
   - ``generate`` (to support text generate with umlfit or thai2fit)
   - ``textaugment`` (to support text augmentation)
-  - ``oskut`` (to support OSKUT)
   - ``nlpo3`` (to support nlpo3 engine)
   - ``spacy_thai`` (to support spacy_thai engine)
   - ``esupar`` (to support esupar engine)
diff --git a/pyproject.toml b/pyproject.toml
index faa102a79..f30ea6242 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@
 [tool.ruff]
 line-length = 79
 indent-width = 4
-target-version = "py38"
+target-version = "py310"
 
 [tool.ruff.format]
 quote-style = "double"
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 993bcff64..04b35e061 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -141,9 +141,6 @@ def word_tokenize(
     * *attacut* - wrapper for
       `AttaCut <https://github.com/PyThaiNLP/attacut>`_.,
       learning-based approach
-    * *deepcut* - wrapper for
-      `DeepCut <https://github.com/rkcosmos/deepcut>`_,
-      learning-based approach
     * *icu* - wrapper for a word tokenizer in
       `PyICU <https://gitlab.pyicu.org/main/pyicu>`_.,
       from ICU (International Components for Unicode),
@@ -162,18 +159,12 @@
     * *nlpo3* - wrapper for a word tokenizer in
       `nlpO3 <https://github.com/PyThaiNLP/nlpo3>`_.,
       adaptation of newmm in Rust (2.5x faster)
-    * *oskut* - wrapper for
-      `OSKut <https://github.com/mrpeerat/OSKut>`_.,
-      Out-of-domain StacKed cut for Word Segmentation
-    * *sefr_cut* - wrapper for
-      `SEFR CUT <https://github.com/mrpeerat/SEFR_CUT>`_.,
-      Stacked Ensemble Filter and Refine for Word Segmentation
     * *tltk* - wrapper for
       `TLTK <https://pypi.org/project/tltk/>`_.,
       maximum collocation approach
 
     :Note:
         The **custom_dict** parameter only works for \
-        *deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
+        *longest*, *newmm*, and *newmm-safe* engines.
 
     :Example:
         Tokenize text with different tokenizers::
@@ -260,14 +251,6 @@
         from pythainlp.tokenize.multi_cut import segment
 
         segments = segment(text, custom_dict)
-    elif engine == "deepcut":  # deepcut can optionally use dictionary
-        from pythainlp.tokenize.deepcut import segment
-
-        if custom_dict:
-            custom_dict = list(custom_dict)
-            segments = segment(text, custom_dict)
-        else:
-            segments = segment(text)
     elif engine == "icu":
         from pythainlp.tokenize.pyicu import segment
 
@@ -275,18 +258,10 @@
     elif engine == "nercut":
         from pythainlp.tokenize.nercut import segment
 
-        segments = segment(text)
-    elif engine == "sefr_cut":
-        from pythainlp.tokenize.sefr_cut import segment
-
         segments = segment(text)
     elif engine == "tltk":
         from pythainlp.tokenize.tltk import segment
 
-        segments = segment(text)
-    elif engine == "oskut":
-        from pythainlp.tokenize.oskut import segment
-
         segments = segment(text)
     elif engine == "nlpo3":
         from pythainlp.tokenize.nlpo3 import segment
@@ -747,7 +722,7 @@ def __init__(
             used to create a trie, or an instantiated
             :class:`pythainlp.util.Trie` object.
         :param str engine: choose between different options of tokenizer engines
-            (i.e. *newmm*, *mm*, *longest*, *deepcut*)
+            (i.e. *newmm*, *mm*, *longest*)
         :param bool keep_whitespace: True to keep whitespace, a common mark
             for end of phrase in Thai
         """
@@ -757,7 +732,7 @@
         else:
             self.__trie_dict = DEFAULT_WORD_DICT_TRIE
         self.__engine = engine
-        if self.__engine not in ["newmm", "mm", "longest", "deepcut"]:
+        if self.__engine not in ["newmm", "mm", "longest"]:
             raise NotImplementedError(
                 """
                 The Tokenizer class is not support %s for custom tokenizer
@@ -788,6 +763,6 @@ def set_tokenize_engine(self, engine: str) -> None:
         Set the tokenizer's engine.
 
         :param str engine: choose between different options of tokenizer engines
-            (i.e. *newmm*, *mm*, *longest*, *deepcut*)
+            (i.e. *newmm*, *mm*, *longest*)
         """
         self.__engine = engine
diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py
deleted file mode 100644
index 38178f344..000000000
--- a/pythainlp/tokenize/deepcut.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# -*- coding: utf-8 -*-
-# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
-# SPDX-License-Identifier: Apache-2.0
-"""
-Wrapper for deepcut Thai word segmentation. deepcut is a
-Thai word segmentation library using 1D Convolution Neural Network.
-
-User need to install deepcut (and its dependency: tensorflow) by themselves.
-
-:See Also:
-    * `GitHub repository <https://github.com/rkcosmos/deepcut>`_
-"""
-
-from typing import List, Union
-
-try:
-    from deepcut import tokenize
-except ImportError:
-    raise ImportError("Please install deepcut by pip install deepcut")
-from pythainlp.util import Trie
-
-
-def segment(
-    text: str, custom_dict: Union[Trie, List[str], str] = []
-) -> List[str]:
-    if not text or not isinstance(text, str):
-        return []
-
-    if custom_dict:
-        if isinstance(custom_dict, Trie):
-            custom_dict = list(custom_dict)
-
-        return tokenize(text, custom_dict)
-
-    return tokenize(text)
diff --git a/pythainlp/tokenize/oskut.py b/pythainlp/tokenize/oskut.py
deleted file mode 100644
index ffe9bc61f..000000000
--- a/pythainlp/tokenize/oskut.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# -*- coding: utf-8 -*-
-# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
-# SPDX-License-Identifier: Apache-2.0
-"""
-Wrapper OSKut (Out-of-domain StacKed cut for Word Segmentation).
-Handling Cross- and Out-of-Domain Samples in Thai Word Segmentation
-Stacked Ensemble Framework and DeepCut as Baseline model (ACL 2021 Findings)
-
-:See Also:
-    * `GitHub repository <https://github.com/mrpeerat/OSKut>`_
-"""
-from typing import List
-
-import oskut
-
-DEFAULT_ENGINE = "ws"
-oskut.load_model(engine=DEFAULT_ENGINE)
-
-
-def segment(text: str, engine: str = "ws") -> List[str]:
-    global DEFAULT_ENGINE
-    if not text or not isinstance(text, str):
-        return []
-    if engine != DEFAULT_ENGINE:
-        DEFAULT_ENGINE = engine
-        oskut.load_model(engine=DEFAULT_ENGINE)
-    return oskut.OSKut(text)
diff --git a/pythainlp/tokenize/sefr_cut.py b/pythainlp/tokenize/sefr_cut.py
deleted file mode 100644
index 34579e2eb..000000000
--- a/pythainlp/tokenize/sefr_cut.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# -*- coding: utf-8 -*-
-# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
-# SPDX-License-Identifier: Apache-2.0
-"""
-Wrapper for SEFR CUT Thai word segmentation. SEFR CUT is a
-Thai Word Segmentation Models using Stacked Ensemble.
-
-:See Also:
-    * `GitHub repository <https://github.com/mrpeerat/SEFR_CUT>`_
-"""
-from typing import List
-
-import sefr_cut
-
-DEFAULT_ENGINE = "ws1000"
-sefr_cut.load_model(engine=DEFAULT_ENGINE)
-
-
-def segment(text: str, engine: str = "ws1000") -> List[str]:
-    global DEFAULT_ENGINE
-    if not text or not isinstance(text, str):
-        return []
-    if engine != DEFAULT_ENGINE:
-        DEFAULT_ENGINE = engine
-        sefr_cut.load_model(engine=DEFAULT_ENGINE)
-    return sefr_cut.tokenize(text)[0]
diff --git a/pythainlp/util/date.py b/pythainlp/util/date.py
index d2e03a11a..3d8c250b5 100644
--- a/pythainlp/util/date.py
+++ b/pythainlp/util/date.py
@@ -26,10 +26,7 @@
 from typing import Union
 import re
 
-try:
-    from zoneinfo import ZoneInfo
-except ImportError:
-    from backports.zoneinfo import ZoneInfo
+from zoneinfo import ZoneInfo
 
 thai_abbr_weekdays = ["จ", "อ", "พ", "พฤ", "ศ", "ส", "อา"]
 
@@ -236,7 +233,7 @@ def thai_strptime(
 #     9,
 #     0,
 #     1,
-#     tzinfo=backports.zoneinfo.ZoneInfo(key='Asia/Bangkok')
+#     tzinfo=zoneinfo.ZoneInfo(key='Asia/Bangkok')
 # )
 """
     d = ""
diff --git a/requirements.txt b/requirements.txt
index 9daa11484..c9620fc0d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-PyYAML==5.4.1
-numpy==1.22.*
+PyYAML==6.0.*
+numpy==2.1.*
 python-crfsuite==0.9.*
-requests==2.31.*
+requests==2.32.*
diff --git a/setup.py b/setup.py
index 72897d889..0ed4d26d4 100644
--- a/setup.py
+++ b/setup.py
@@ -38,8 +38,6 @@
 requirements = [
     "requests>=2.22.0",
-    "backports.zoneinfo; python_version<'3.9'",
-    "tzdata; sys_platform == 'win32'"
 ]
 
 extras = {
@@ -71,13 +69,11 @@
     "wtp": ["transformers>=4.6.0", "wtpsplit>=1.0.1"],
     "wordnet": ["nltk>=3.3"],
     "generate": ["fastai<2.0"],
-    "sefr_cut": ["sefr_cut>=1.1"],
     "spell": [
         "phunspell>=0.1.6",
         "spylls>=0.1.5",
         "symspellpy>=6.7.6"
     ],
-    "oskut": ["oskut>=1.3"],
     "nlpo3": ["nlpo3>=1.2.2"],
     "onnx": [
         "sentencepiece>=0.1.91",
@@ -139,11 +135,9 @@
         "fastai<2.0",
         "bpemb>=0.3.2",
         "transformers>=4.22.1",
-        "sefr_cut>=1.1",
         "phunspell>=0.1.6",
         "spylls>=0.1.5",
         "symspellpy>=6.7.6",
-        "oskut>=1.3",
         "nlpo3>=1.2.2",
         "onnxruntime>=1.10.0",
         "thai_nner",
@@ -170,7 +164,7 @@
     url="https://github.com/PyThaiNLP/pythainlp",
     packages=find_packages(exclude=["tests", "tests.*"]),
     test_suite="tests",
-    python_requires=">=3.7",
+    python_requires=">=3.9",
     package_data={
         "pythainlp": [
             "corpus/*",
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index ad5a1f5e9..f7600e0e7 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -8,16 +8,13 @@
     DEFAULT_WORD_DICT_TRIE,
     Tokenizer,
     attacut,
-    deepcut,
     etcc,
     longest,
     multi_cut,
     nercut,
     newmm,
-    oskut,
     paragraph_tokenize,
     pyicu,
-    sefr_cut,
     sent_tokenize,
     ssg,
     subword_tokenize,
@@ -455,15 +452,12 @@ def test_word_tokenize(self):
         )
         self.assertIsNotNone(word_tokenize(self.text_1, engine="nlpo3"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="attacut"))
-        self.assertIsNotNone(word_tokenize(self.text_1, engine="deepcut"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="icu"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="longest"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="mm"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="nercut"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="newmm"))
-        self.assertIsNotNone(word_tokenize(self.text_1, engine="sefr_cut"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="tltk"))
-        self.assertIsNotNone(word_tokenize(self.text_1, engine="oskut"))
 
         with self.assertRaises(ValueError):
             word_tokenize("หมอนทอง", engine="XX")  # engine does not exist
@@ -487,18 +481,6 @@ def test_attacut(self):
             attacut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-c")
         )
 
-    def test_deepcut(self):
-        self.assertEqual(deepcut.segment(None), [])
-        self.assertEqual(deepcut.segment(""), [])
-        self.assertIsNotNone(deepcut.segment("ทดสอบ", DEFAULT_WORD_DICT_TRIE))
-        self.assertIsNotNone(deepcut.segment("ทดสอบ", ["ทด", "สอบ"]))
-        self.assertIsNotNone(word_tokenize("ทดสอบ", engine="deepcut"))
-        self.assertIsNotNone(
-            word_tokenize(
-                "ทดสอบ", engine="deepcut", custom_dict=DEFAULT_WORD_DICT_TRIE
-            )
-        )
-
     def test_etcc(self):
         self.assertEqual(etcc.segment(None), [])
         self.assertEqual(etcc.segment(""), [])
@@ -797,26 +779,6 @@ def test_tcc_p(self):
         self.assertEqual(list(tcc_p.tcc("")), [])
         self.assertEqual(tcc_p.tcc_pos(""), set())
 
-    def test_sefr_cut(self):
-        self.assertEqual(sefr_cut.segment(None), [])
-        self.assertEqual(sefr_cut.segment(""), [])
-        self.assertIsNotNone(
-            sefr_cut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"),
-        )
-        self.assertIsNotNone(
-            sefr_cut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="tnhc"),
-        )
-
-    def test_oskut(self):
-        self.assertEqual(oskut.segment(None), [])
-        self.assertEqual(oskut.segment(""), [])
-        self.assertIsNotNone(
-            oskut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"),
-        )
-        self.assertIsNotNone(
-            oskut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="scads"),
-        )
-
     def test_word_detokenize(self):
         self.assertEqual(
             word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), "ผมเลี้ยง 5 ตัว"
@@ -842,7 +804,7 @@ def test_word_detokenize(self):
         )
 
     def test_numeric_data_format(self):
-        engines = ["attacut", "deepcut", "newmm", "sefr_cut"]
+        engines = ["attacut", "newmm"]
         for engine in engines:
             self.assertIn(