From 1d7f2734d45d9edaa34405343a9065b805455560 Mon Sep 17 00:00:00 2001 From: mike0sv Date: Tue, 24 May 2022 17:03:58 +0300 Subject: [PATCH 1/4] highly experimental and WIP fastai support --- .pylintrc | 2 +- mlem/contrib/catboost.py | 27 +++++----- mlem/contrib/fastai.py | 95 ++++++++++++++++++++++++++++++++++++ mlem/contrib/lightgbm.py | 19 ++++---- mlem/contrib/xgboost.py | 23 +++++---- mlem/core/model.py | 14 ++++++ mlem/ext.py | 1 + mlem/utils/module.py | 4 +- setup.py | 4 ++ tests/contrib/test_fastai.py | 5 ++ 10 files changed, 154 insertions(+), 40 deletions(-) create mode 100644 mlem/contrib/fastai.py create mode 100644 tests/contrib/test_fastai.py diff --git a/.pylintrc b/.pylintrc index 4ea818e6..3c92f8f9 100644 --- a/.pylintrc +++ b/.pylintrc @@ -395,7 +395,7 @@ ignore-imports=no ignore-signatures=no # Minimum lines number of a similarity. -min-similarity-lines=15 +min-similarity-lines=20 [BASIC] diff --git a/mlem/contrib/catboost.py b/mlem/contrib/catboost.py index 655a2bee..a28b795f 100644 --- a/mlem/contrib/catboost.py +++ b/mlem/contrib/catboost.py @@ -1,15 +1,18 @@ -import os -import posixpath -import tempfile from enum import Enum from typing import Any, ClassVar, Optional import catboost from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor -from mlem.core.artifacts import Artifacts, Storage +from mlem.core.artifacts import Artifacts from mlem.core.hooks import IsInstanceHookMixin -from mlem.core.model import ModelHook, ModelIO, ModelType, Signature +from mlem.core.model import ( + BufferModelIO, + ModelHook, + ModelIO, + ModelType, + Signature, +) from mlem.core.requirements import InstallableRequirement, Requirements @@ -18,7 +21,7 @@ class CBType(str, Enum): regressor = "reg" -class CatBoostModelIO(ModelIO): +class CatBoostModelIO(BufferModelIO): """ :class:`mlem.core.model.ModelIO` for CatBoost models. """ @@ -28,16 +31,8 @@ class CatBoostModelIO(ModelIO): regressor_file_name: ClassVar = "rgr.cb" model_type: CBType = CBType.regressor - def dump(self, storage: Storage, path, model) -> Artifacts: - with tempfile.TemporaryDirectory() as tmpdir: - model_name = self._get_model_file_name(model) - model_path = os.path.join(tmpdir, model_name) - model.save_model(model_path) - return { - self.art_name: storage.upload( - model_path, posixpath.join(path, model_name) - ) - } + def save_model(self, model: Any, path: str): + model.save_model(path) def load(self, artifacts: Artifacts): """ diff --git a/mlem/contrib/fastai.py b/mlem/contrib/fastai.py new file mode 100644 index 00000000..53a53dc7 --- /dev/null +++ b/mlem/contrib/fastai.py @@ -0,0 +1,95 @@ +from typing import Any, ClassVar, Optional, Type, Union + +from fastai.data.transforms import Category +from fastai.learner import Learner, load_learner +from fastai.vision.core import PILImage +from pydantic import BaseModel + +from mlem.core.artifacts import Artifacts +from mlem.core.dataset_type import DatasetHook, DatasetSerializer, DatasetType +from mlem.core.hooks import IsInstanceHookMixin +from mlem.core.model import BufferModelIO, ModelHook, ModelType, Signature +from mlem.core.requirements import Requirements + + +class FastAIModelIO(BufferModelIO): + type: ClassVar = "fastai" + + def save_model(self, model: Any, path: str): + model.export(path) + + def load(self, artifacts: Artifacts): + with artifacts[self.art_name].open() as f: + return load_learner(f) + + +class FastAIModel(ModelType, ModelHook, IsInstanceHookMixin): + type: ClassVar = "fastai" + valid_types: ClassVar = (Learner,) + io: FastAIModelIO = FastAIModelIO() + + @classmethod + def process( + cls, obj: "Learner", sample_data: Optional[Any] = None, **kwargs + ) -> ModelType: + + return FastAIModel( + methods={ + "predict": Signature.from_method( + obj.predict, item=sample_data, auto_infer=True + ) + } + ) + + +class CategoryDataType( + DatasetType, DatasetSerializer, DatasetHook, IsInstanceHookMixin +): + type: ClassVar = "fastai_category" + valid_types: ClassVar = (Category,) + value: str + + def serialize(self, instance: Any) -> dict: + raise NotImplementedError # TODO + + def deserialize(self, obj: dict) -> Any: + raise NotImplementedError # TODO + + def get_model(self, prefix: str = "") -> Union[Type[BaseModel], type]: + raise NotImplementedError # TODO + + def get_requirements(self) -> Requirements: + return Requirements.new("fastai") + + @classmethod + def process(cls, obj: Any, **kwargs): + return CategoryDataType(value=str(obj)) + + def get_writer(self, **kwargs): + raise NotImplementedError # TODO + + +class PILImageDataType( + DatasetType, DatasetSerializer, DatasetHook, IsInstanceHookMixin +): + type: ClassVar = "fastai_pil_image" + valid_types: ClassVar = (PILImage,) + + def serialize(self, instance: Any) -> dict: + raise NotImplementedError # TODO + + def deserialize(self, obj: dict) -> Any: + raise NotImplementedError # TODO + + def get_model(self, prefix: str = "") -> Union[Type[BaseModel], type]: + raise NotImplementedError # TODO + + def get_requirements(self) -> Requirements: + return Requirements.new("fastai") + + @classmethod + def process(cls, obj: Any, **kwargs): + return PILImageDataType() + + def get_writer(self, **kwargs): + raise NotImplementedError # TODO diff --git a/mlem/contrib/lightgbm.py b/mlem/contrib/lightgbm.py index 1e697958..4c69355b 100644 --- a/mlem/contrib/lightgbm.py +++ b/mlem/contrib/lightgbm.py @@ -1,5 +1,4 @@ import os -import posixpath import tempfile from typing import Any, ClassVar, Iterator, List, Optional, Tuple, Type @@ -18,7 +17,13 @@ ) from mlem.core.errors import DeserializationError, SerializationError from mlem.core.hooks import IsInstanceHookMixin -from mlem.core.model import ModelHook, ModelIO, ModelType, Signature +from mlem.core.model import ( + BufferModelIO, + ModelHook, + ModelIO, + ModelType, + Signature, +) from mlem.core.requirements import ( AddRequirementHook, InstallableRequirement, @@ -120,7 +125,7 @@ def read_batch( raise NotImplementedError -class LightGBMModelIO(ModelIO): +class LightGBMModelIO(BufferModelIO): """ :class:`.ModelIO` implementation for `lightgbm.Booster` type """ @@ -128,12 +133,8 @@ class LightGBMModelIO(ModelIO): type: ClassVar[str] = "lightgbm_io" model_file_name = "model.lgb" - def dump(self, storage: Storage, path, model) -> Artifacts: - with tempfile.TemporaryDirectory(prefix="mlem_lightgbm_dump") as f: - model_path = os.path.join(f, self.model_file_name) - model.save_model(model_path) - fs_path = posixpath.join(path, self.model_file_name) - return {self.art_name: storage.upload(model_path, fs_path)} + def save_model(self, model: Any, path: str): + model.save_model(path) def load(self, artifacts: Artifacts): if len(artifacts) != 1: diff --git a/mlem/contrib/xgboost.py b/mlem/contrib/xgboost.py index f619fa3f..53208024 100644 --- a/mlem/contrib/xgboost.py +++ b/mlem/contrib/xgboost.py @@ -1,5 +1,4 @@ import os -import posixpath import tempfile from typing import Any, ClassVar, Dict, List, Optional, Type @@ -8,7 +7,7 @@ from mlem.constants import PREDICT_METHOD_NAME from mlem.contrib.numpy import python_type_from_np_string_repr -from mlem.core.artifacts import Artifacts, Storage +from mlem.core.artifacts import Artifacts from mlem.core.dataset_type import ( DatasetHook, DatasetSerializer, @@ -17,7 +16,13 @@ ) from mlem.core.errors import DeserializationError, SerializationError from mlem.core.hooks import IsInstanceHookMixin -from mlem.core.model import ModelHook, ModelIO, ModelType, Signature +from mlem.core.model import ( + BufferModelIO, + ModelHook, + ModelIO, + ModelType, + Signature, +) from mlem.core.requirements import ( AddRequirementHook, InstallableRequirement, @@ -115,7 +120,7 @@ def get_model(self, prefix: str = "") -> Type[BaseModel]: raise NotImplementedError -class XGBoostModelIO(ModelIO): +class XGBoostModelIO(BufferModelIO): """ :class:`~.ModelIO` implementation for XGBoost models """ @@ -123,14 +128,8 @@ class XGBoostModelIO(ModelIO): type: ClassVar[str] = "xgboost_io" model_file_name = "model.xgb" - def dump( - self, storage: Storage, path, model: xgboost.Booster - ) -> Artifacts: - with tempfile.TemporaryDirectory(prefix="mlem_xgboost_dump") as f: - local_path = os.path.join(f, self.model_file_name) - model.save_model(local_path) - remote_path = posixpath.join(path, self.model_file_name) - return {self.art_name: storage.upload(local_path, remote_path)} + def save_model(self, model: Any, path: str): + model.save_model(path) def load(self, artifacts: Artifacts): if len(artifacts) != 1: diff --git a/mlem/core/model.py b/mlem/core/model.py index 24c1bfab..1d9c9cd1 100644 --- a/mlem/core/model.py +++ b/mlem/core/model.py @@ -2,7 +2,9 @@ Base classes to work with ML models in MLEM """ import inspect +import os import pickle +import tempfile from abc import ABC, abstractmethod from typing import ( Any, @@ -54,6 +56,18 @@ def load(self, artifacts: Artifacts): raise NotImplementedError +class BufferModelIO(ModelIO, ABC): + @abstractmethod + def save_model(self, model: Any, path: str): + raise NotImplementedError + + def dump(self, storage: Storage, path, model) -> Artifacts: + with tempfile.TemporaryDirectory() as tmpdir: + model_path = os.path.join(tmpdir, "model") + self.save_model(model, model_path) + return {self.art_name: storage.upload(model_path, path)} + + class SimplePickleIO(ModelIO): """IO with simple pickling of python model object""" diff --git a/mlem/ext.py b/mlem/ext.py index 63d2a7b8..c9615545 100644 --- a/mlem/ext.py +++ b/mlem/ext.py @@ -95,6 +95,7 @@ class ExtensionLoader: Extension("mlem.contrib.fastapi", ["fastapi", "uvicorn"], False), Extension("mlem.contrib.callable", [], True), Extension("mlem.contrib.rabbitmq", ["pika"], False), + Extension("mlem.contrib.fastai", ["fastai"], False), ) _loaded_extensions: Dict[Extension, ModuleType] = {} diff --git a/mlem/utils/module.py b/mlem/utils/module.py index 02297997..7a4118c5 100644 --- a/mlem/utils/module.py +++ b/mlem/utils/module.py @@ -229,7 +229,7 @@ def is_from_installable_module(obj: object): return is_installable_module(mod) -def get_module_version(mod: ModuleType): +def get_module_version(mod: ModuleType) -> Optional[str]: """ Determines version of given module object. @@ -238,7 +238,7 @@ def get_module_version(mod: ModuleType): """ for attr in "__version__", "VERSION": if hasattr(mod, attr): - return getattr(mod, attr) + return str(getattr(mod, attr)) if mod.__file__ is None: return None for name in os.listdir(os.path.dirname(mod.__file__)): diff --git a/setup.py b/setup.py index 851e4e8b..2269dfa5 100644 --- a/setup.py +++ b/setup.py @@ -115,6 +115,8 @@ "dataset_reader.numpy = mlem.contrib.numpy:NumpyArrayReader", "dataset_reader.pandas = mlem.contrib.pandas:PandasReader", "dataset_type.dataframe = mlem.contrib.pandas:DataFrameType", + "dataset_type.fastai_category = mlem.contrib.fastai:CategoryDataType", + "dataset_type.fastai_pil_image = mlem.contrib.fastai:PILImageDataType", "dataset_type.series = mlem.contrib.pandas:SeriesType", "dataset_type.lightgbm = mlem.contrib.lightgbm:LightGBMDatasetType", "dataset_type.ndarray = mlem.contrib.numpy:NumpyNdarrayType", @@ -144,6 +146,7 @@ "model_io.pickle = mlem.contrib.callable:PickleModelIO", "model_io.xgboost_io = mlem.contrib.xgboost:XGBoostModelIO", "model_io.torch_io = mlem.contrib.torch:TorchModelIO", + "model_io.fastai = mlem.contrib.fastai:FastAIModelIO", "model_type.callable = mlem.contrib.callable:CallableModelType", "model_type.catboost = mlem.contrib.catboost:CatBoostModel", "model_type.lightgbm = mlem.contrib.lightgbm:LightGBMModel", @@ -151,6 +154,7 @@ "model_type.sklearn_pipeline = mlem.contrib.sklearn:SklearnPipelineType", "model_type.xgboost = mlem.contrib.xgboost:XGBoostModel", "model_type.torch = mlem.contrib.torch:TorchModel", + "model_type.fastai = mlem.contrib.fastai:FastAIModel", "packager.docker = mlem.contrib.docker.base:DockerImagePackager", "packager.docker_dir = mlem.contrib.docker.base:DockerDirPackager", "packager.pip = mlem.contrib.pip.base:PipPackager", diff --git a/tests/contrib/test_fastai.py b/tests/contrib/test_fastai.py new file mode 100644 index 00000000..35be0700 --- /dev/null +++ b/tests/contrib/test_fastai.py @@ -0,0 +1,5 @@ +# TODO + + +def test_learner(): + pass From 231964029402fc94cdc759f37abc9fe1414c1903 Mon Sep 17 00:00:00 2001 From: mike0sv Date: Tue, 24 May 2022 17:04:32 +0300 Subject: [PATCH 2/4] highly experimental and WIP fastai support --- .pylintrc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pylintrc b/.pylintrc index 3c92f8f9..75fac659 100644 --- a/.pylintrc +++ b/.pylintrc @@ -389,13 +389,13 @@ ignore-comments=yes ignore-docstrings=yes # Ignore imports when computing similarities. -ignore-imports=no +ignore-imports=yes # Ignore function signatures when computing similarities. ignore-signatures=no # Minimum lines number of a similarity. -min-similarity-lines=20 +min-similarity-lines=15 [BASIC] From 34d4e6f40e1f9ebab331778e378e88b4695692d4 Mon Sep 17 00:00:00 2001 From: mike0sv Date: Mon, 6 Jun 2022 21:50:11 -0400 Subject: [PATCH 3/4] add secret envs --- .github/workflows/check-test-release.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/check-test-release.yml b/.github/workflows/check-test-release.yml index e82b3663..f0fff764 100644 --- a/.github/workflows/check-test-release.yml +++ b/.github/workflows/check-test-release.yml @@ -88,6 +88,8 @@ jobs: HEROKU_TEAM: iterative-sandbox GITHUB_MATRIX_OS: ${{ matrix.os }} GITHUB_MATRIX_PYTHON: ${{ matrix.python }} + BITBUCKET_USERNAME: ${{ secrets.BITBUCKET_USERNAME }} + BITBUCKET_PASSWORD: ${{ secrets.BITBUCKET_PASSWORD }} - name: "Upload coverage to Codecov" uses: codecov/codecov-action@v1 with: From 5e089f76b50a8c4eed5b5f1de1100f0f975560dd Mon Sep 17 00:00:00 2001 From: mike0sv Date: Thu, 16 Jun 2022 16:20:51 +0300 Subject: [PATCH 4/4] fix rename and namespace package --- mlem/contrib/fastai.py | 18 ++++++++++++------ mlem/utils/module.py | 4 +++- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/mlem/contrib/fastai.py b/mlem/contrib/fastai.py index 53a53dc7..26a53e2c 100644 --- a/mlem/contrib/fastai.py +++ b/mlem/contrib/fastai.py @@ -6,7 +6,7 @@ from pydantic import BaseModel from mlem.core.artifacts import Artifacts -from mlem.core.dataset_type import DatasetHook, DatasetSerializer, DatasetType +from mlem.core.data_type import DataHook, DataSerializer, DataType, DataWriter from mlem.core.hooks import IsInstanceHookMixin from mlem.core.model import BufferModelIO, ModelHook, ModelType, Signature from mlem.core.requirements import Requirements @@ -36,14 +36,16 @@ def process( return FastAIModel( methods={ "predict": Signature.from_method( - obj.predict, item=sample_data, auto_infer=True + obj.predict, + item=sample_data, + auto_infer=sample_data is not None, ) } ) class CategoryDataType( - DatasetType, DatasetSerializer, DatasetHook, IsInstanceHookMixin + DataType, DataSerializer, DataHook, IsInstanceHookMixin ): type: ClassVar = "fastai_category" valid_types: ClassVar = (Category,) @@ -65,12 +67,14 @@ def get_requirements(self) -> Requirements: def process(cls, obj: Any, **kwargs): return CategoryDataType(value=str(obj)) - def get_writer(self, **kwargs): + def get_writer( + self, project: str = None, filename: str = None, **kwargs + ) -> DataWriter: raise NotImplementedError # TODO class PILImageDataType( - DatasetType, DatasetSerializer, DatasetHook, IsInstanceHookMixin + DataType, DataSerializer, DataHook, IsInstanceHookMixin ): type: ClassVar = "fastai_pil_image" valid_types: ClassVar = (PILImage,) @@ -91,5 +95,7 @@ def get_requirements(self) -> Requirements: def process(cls, obj: Any, **kwargs): return PILImageDataType() - def get_writer(self, **kwargs): + def get_writer( + self, project: str = None, filename: str = None, **kwargs + ) -> DataWriter: raise NotImplementedError # TODO diff --git a/mlem/utils/module.py b/mlem/utils/module.py index 4cb3fddd..53bdd6b9 100644 --- a/mlem/utils/module.py +++ b/mlem/utils/module.py @@ -545,7 +545,9 @@ def add_requirement(self, obj_or_module): ) if parent_package_name not in self._modules: parent_package = sys.modules[parent_package_name] - self.add_requirement(parent_package) + # exclude namespace packages + if parent_package.__file__ is not None: + self.add_requirement(parent_package) def save(self, obj, save_persistent_id=True): if id(obj) in self.seen or isinstance(obj, IGNORE_TYPES_REQ):