From 710fcccf2073b4fb69636a2ea0157714e489c37d Mon Sep 17 00:00:00 2001 From: Piotr Synowiec Date: Mon, 21 Oct 2024 14:03:41 +0200 Subject: [PATCH 1/4] doc: wip --- CHANGELOG.md | 11 ++++++++ Makefile | 3 ++ README.md | 10 +------ Usage.md | 10 +++++++ docs/.gitignore | 1 + mysiar_data_flow/__init__.py | 6 ++++ mysiar_data_flow/data_flow.py | 52 +++++++++++++++++++++++------------ requirements.dev.txt | 3 +- 8 files changed, 69 insertions(+), 27 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 Usage.md create mode 100644 docs/.gitignore diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..2ef0dca --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,11 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.0.1] - 2024-10-16 + +### Added +- initial version diff --git a/Makefile b/Makefile index b13ad83..83d13b6 100644 --- a/Makefile +++ b/Makefile @@ -25,3 +25,6 @@ upload-test:: upload:: $(MAKE) build . 
venv/bin/activate && python -m twine upload -u $${PYPI_USER} -p $${PYPI_PASS} --verbose dist/* + +docs:: + venv/bin/pdoc mysiar_data_flow/ -o docs/ diff --git a/README.md b/README.md index be0f7d6..4a86d1d 100644 --- a/README.md +++ b/README.md @@ -21,17 +21,9 @@ library to manipulate data -## Installation instructions +## Installation ```sh pip install mysiar-data-flow ``` -## DataFlow.DataFrame - -### Usage -For now check [mysiar_data_flow/data_flow.py](mysiar_data_flow/data_flow.py) file for interface - - - -![work in progress](.github/5578703.png) diff --git a/Usage.md b/Usage.md new file mode 100644 index 0000000..1da0a52 --- /dev/null +++ b/Usage.md @@ -0,0 +1,10 @@ +# Usage + +## DataFlow.DataFrame + + +For now check [mysiar_data_flow/data_flow.py](https://github.com/mysiar-org/python-data-flow/blob/master/mysiar_data_flow/data_flow.py) file for interface + + + +![work in progress](.github/5578703.png) diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..72e8ffc --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +* diff --git a/mysiar_data_flow/__init__.py b/mysiar_data_flow/__init__.py index b98e3ed..9f82913 100644 --- a/mysiar_data_flow/__init__.py +++ b/mysiar_data_flow/__init__.py @@ -1 +1,7 @@ +""" + .. include:: ../README.md + .. include:: ../Usage.md + .. 
include:: ../CHANGELOG.md +""" + from .data_flow import DataFlow diff --git a/mysiar_data_flow/data_flow.py b/mysiar_data_flow/data_flow.py index fc1994a..5352368 100644 --- a/mysiar_data_flow/data_flow.py +++ b/mysiar_data_flow/data_flow.py @@ -53,7 +53,7 @@ def __del__(self): if not self.__in_memory: delete_file(self.__filename) - def from_fireducks(self, df: fd.DataFrame): + def from_fireducks(self, df: fd.DataFrame) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = df else: @@ -66,7 +66,7 @@ def to_fireducks(self) -> fd.DataFrame: else: return to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type) - def from_pandas(self, df: pd.DataFrame): + def from_pandas(self, df: pd.DataFrame) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = fd.from_pandas(df) else: @@ -79,7 +79,7 @@ def to_pandas(self) -> pd.DataFrame: else: return to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas() - def from_polars(self, df: pl.DataFrame): + def from_polars(self, df: pl.DataFrame) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = fd.from_pandas(df.to_pandas()) else: @@ -94,70 +94,70 @@ def to_polars(self) -> pl.DataFrame: to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas() ) - def from_csv(self, filename: str): + def from_csv(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = fd.read_csv(filename) else: from_csv_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def to_csv(self, filename: str, index=False): + def to_csv(self, filename: str, index=False) -> "DataFlow.DataFrame": if self.__in_memory: self.__data.to_csv(filename, index=index) else: to_csv_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def from_feather(self, filename: str): + def from_feather(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: 
self.__data = fd.from_pandas(feather.read_feather(filename)) else: from_feather_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def to_feather(self, filename: str): + def to_feather(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data.to_feather(filename) else: to_feather_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def from_parquet(self, filename: str): + def from_parquet(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = fd.read_parquet(filename) else: from_parquet_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def to_parquet(self, filename: str): + def to_parquet(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data.to_parquet(filename) else: to_parquet_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def from_json(self, filename: str): + def from_json(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = fd.read_json(filename) else: from_json_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def to_json(self, filename: str): + def to_json(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data.to_json(filename) else: to_json_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def from_hdf(self, filename: str): + def from_hdf(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = fd.read_hdf(filename) else: from_hdf_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def to_hdf(self, filename: str, key: str = "key"): + def to_hdf(self, filename: str, key: str = "key") -> "DataFlow.DataFrame": if self.__in_memory: self.__data.to_hdf(path_or_buf=filename, key=key) 
else: @@ -165,12 +165,17 @@ def to_hdf(self, filename: str, key: str = "key"): return self def columns(self) -> list: + """ + lists columns in data frame + + :return: list - list of columns in data frame + """ if self.__in_memory: return self.__data.columns.to_list() else: return data_get_columns(tmp_filename=self.__filename, file_type=self.__file_type) - def columns_delete(self, columns: list): + def columns_delete(self, columns: list) -> "DataFlow.DataFrame": if self.__in_memory: self.__data.drop(columns=columns, inplace=True) else: @@ -178,7 +183,13 @@ def columns_delete(self, columns: list): return self - def columns_rename(self, columns_mapping: dict): + def columns_rename(self, columns_mapping: dict) -> "DataFlow.DataFrame": + """ + rename columns + + :param columns_mapping: dict - old_name: new_name pairs ex. {"Year": "year", "Units": "units"} + :return: + """ if self.__in_memory: self.__data.rename(columns=columns_mapping, inplace=True) else: @@ -189,13 +200,19 @@ def columns_rename(self, columns_mapping: dict): ) return self - def columns_select(self, columns: list): + def columns_select(self, columns: list) -> "DataFlow.DataFrame": + """ + columns select - columns to keep in data frame + :param columns: + :return: + """ if self.__in_memory: self.__data = self.__data[columns] else: data_select_columns(tmp_filename=self.__filename, file_type=self.__file_type, columns=columns) + return self - def filter_on_column(self, column: str, value: Any, operator: Operator): + def filter_on_column(self, column: str, value: Any, operator: Operator) -> "DataFlow.DataFrame": if self.__in_memory: match operator: case Operator.Eq: @@ -218,3 +235,4 @@ def filter_on_column(self, column: str, value: Any, operator: Operator): value=value, operator=operator, ) + return self diff --git a/requirements.dev.txt b/requirements.dev.txt index 9440ad0..dc5b93e 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -4,4 +4,5 @@ pyproject-flake8 pytest pytest-cov poetry -twine \ 
No newline at end of file +twine +pdoc \ No newline at end of file From e50e96021a8f6552ee53a7480e3e38467c91c6f1 Mon Sep 17 00:00:00 2001 From: Piotr Synowiec Date: Mon, 21 Oct 2024 18:11:38 +0200 Subject: [PATCH 2/4] doc: update --- Makefile | 3 - Usage.md | 22 ++++++- docs/.gitignore | 1 - mysiar_data_flow/data_flow.py | 119 +++++++++++++++++++--------------- 4 files changed, 88 insertions(+), 57 deletions(-) delete mode 100644 docs/.gitignore diff --git a/Makefile b/Makefile index 83d13b6..b13ad83 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,3 @@ upload-test:: upload:: $(MAKE) build . venv/bin/activate && python -m twine upload -u $${PYPI_USER} -p $${PYPI_PASS} --verbose dist/* - -docs:: - venv/bin/pdoc mysiar_data_flow/ -o docs/ diff --git a/Usage.md b/Usage.md index 1da0a52..9fca1f2 100644 --- a/Usage.md +++ b/Usage.md @@ -3,7 +3,27 @@ ## DataFlow.DataFrame -For now check [mysiar_data_flow/data_flow.py](https://github.com/mysiar-org/python-data-flow/blob/master/mysiar_data_flow/data_flow.py) file for interface +Create empty data frame object in memory +```python + +from mysiar_data_flow import DataFlow + +df = DataFlow().DataFrame() +df.from_pandas(df=pandas_data_frame_obj) + +``` +Create data frame object in memory from Pandas data frame +```python + +from mysiar_data_flow import DataFlow + +df = DataFlow().DataFrame().from_pandas(df=pandas_data_frame_obj) +``` + + + +--- +For more check [mysiar_data_flow/data_flow.py](https://github.com/mysiar-org/python-data-flow/blob/master/mysiar_data_flow/data_flow.py) file for interface diff --git a/docs/.gitignore b/docs/.gitignore deleted file mode 100644 index 72e8ffc..0000000 --- a/docs/.gitignore +++ /dev/null @@ -1 +0,0 @@ -* diff --git a/mysiar_data_flow/data_flow.py b/mysiar_data_flow/data_flow.py index 5352368..cc55821 100644 --- a/mysiar_data_flow/data_flow.py +++ b/mysiar_data_flow/data_flow.py @@ -53,6 +53,20 @@ def __del__(self): if not self.__in_memory: delete_file(self.__filename) + def 
from_csv(self, filename: str) -> "DataFlow.DataFrame": + if self.__in_memory: + self.__data = fd.read_csv(filename) + else: + from_csv_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) + return self + + def from_feather(self, filename: str) -> "DataFlow.DataFrame": + if self.__in_memory: + self.__data = fd.from_pandas(feather.read_feather(filename)) + else: + from_feather_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) + return self + def from_fireducks(self, df: fd.DataFrame) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = df @@ -60,11 +74,19 @@ def from_fireducks(self, df: fd.DataFrame) -> "DataFlow.DataFrame": from_fireducks_2_file(df=df, tmp_filename=self.__filename, file_type=self.__file_type) return self - def to_fireducks(self) -> fd.DataFrame: + def from_hdf(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: - return self.__data + self.__data = fd.read_hdf(filename) else: - return to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type) + from_hdf_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) + return self + + def from_json(self, filename: str) -> "DataFlow.DataFrame": + if self.__in_memory: + self.__data = fd.read_json(filename) + else: + from_json_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) + return self def from_pandas(self, df: pd.DataFrame) -> "DataFlow.DataFrame": if self.__in_memory: @@ -73,11 +95,12 @@ def from_pandas(self, df: pd.DataFrame) -> "DataFlow.DataFrame": from_pandas_2_file(df=df, tmp_filename=self.__filename, file_type=self.__file_type) return self - def to_pandas(self) -> pd.DataFrame: + def from_parquet(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: - return self.__data.to_pandas() + self.__data = fd.read_parquet(filename) else: - return to_fireducks_from_file(tmp_filename=self.__filename, 
file_type=self.__file_type).to_pandas() + from_parquet_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) + return self def from_polars(self, df: pl.DataFrame) -> "DataFlow.DataFrame": if self.__in_memory: @@ -86,21 +109,6 @@ def from_polars(self, df: pl.DataFrame) -> "DataFlow.DataFrame": from_pandas_2_file(df=df.to_pandas(), tmp_filename=self.__filename, file_type=self.__file_type) return self - def to_polars(self) -> pl.DataFrame: - if self.__in_memory: - return pl.from_pandas(self.__data.to_pandas()) - else: - return pl.from_pandas( - to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas() - ) - - def from_csv(self, filename: str) -> "DataFlow.DataFrame": - if self.__in_memory: - self.__data = fd.read_csv(filename) - else: - from_csv_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) - return self - def to_csv(self, filename: str, index=False) -> "DataFlow.DataFrame": if self.__in_memory: self.__data.to_csv(filename, index=index) @@ -108,13 +116,6 @@ def to_csv(self, filename: str, index=False) -> "DataFlow.DataFrame": to_csv_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def from_feather(self, filename: str) -> "DataFlow.DataFrame": - if self.__in_memory: - self.__data = fd.from_pandas(feather.read_feather(filename)) - else: - from_feather_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) - return self - def to_feather(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data.to_feather(filename) @@ -122,25 +123,17 @@ def to_feather(self, filename: str) -> "DataFlow.DataFrame": to_feather_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def from_parquet(self, filename: str) -> "DataFlow.DataFrame": - if self.__in_memory: - self.__data = fd.read_parquet(filename) - else: - 
from_parquet_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) - return self - - def to_parquet(self, filename: str) -> "DataFlow.DataFrame": + def to_fireducks(self) -> fd.DataFrame: if self.__in_memory: - self.__data.to_parquet(filename) + return self.__data else: - to_parquet_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) - return self + return to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type) - def from_json(self, filename: str) -> "DataFlow.DataFrame": + def to_hdf(self, filename: str, key: str = "key") -> "DataFlow.DataFrame": if self.__in_memory: - self.__data = fd.read_json(filename) + self.__data.to_hdf(path_or_buf=filename, key=key) else: - from_json_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) + to_hdf_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type, key=key) return self def to_json(self, filename: str) -> "DataFlow.DataFrame": @@ -150,20 +143,27 @@ def to_json(self, filename: str) -> "DataFlow.DataFrame": to_json_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def from_hdf(self, filename: str) -> "DataFlow.DataFrame": + def to_pandas(self) -> pd.DataFrame: if self.__in_memory: - self.__data = fd.read_hdf(filename) + return self.__data.to_pandas() else: - from_hdf_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) - return self + return to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas() - def to_hdf(self, filename: str, key: str = "key") -> "DataFlow.DataFrame": + def to_parquet(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: - self.__data.to_hdf(path_or_buf=filename, key=key) + self.__data.to_parquet(filename) else: - to_hdf_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type, key=key) + 
to_parquet_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self + def to_polars(self) -> pl.DataFrame: + if self.__in_memory: + return pl.from_pandas(self.__data.to_pandas()) + else: + return pl.from_pandas( + to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas() + ) + def columns(self) -> list: """ lists columns in data frame @@ -176,6 +176,12 @@ def columns(self) -> list: return data_get_columns(tmp_filename=self.__filename, file_type=self.__file_type) def columns_delete(self, columns: list) -> "DataFlow.DataFrame": + """ + deletes columns from data frame + + :param columns: list - list of columns to delete + :return: self + """ if self.__in_memory: self.__data.drop(columns=columns, inplace=True) else: @@ -188,7 +194,7 @@ def columns_rename(self, columns_mapping: dict) -> "DataFlow.DataFrame": rename columns :param columns_mapping: dict - old_name: new_name pairs ex. {"Year": "year", "Units": "units"} - :return: + :return: self """ if self.__in_memory: self.__data.rename(columns=columns_mapping, inplace=True) @@ -203,8 +209,9 @@ def columns_rename(self, columns_mapping: dict) -> "DataFlow.DataFrame": def columns_select(self, columns: list) -> "DataFlow.DataFrame": """ columns select - columns to keep in data frame - :param columns: - :return: + + :param columns: list - list of columns to select + :return: self """ if self.__in_memory: self.__data = self.__data[columns] @@ -213,6 +220,14 @@ def columns_select(self, columns: list) -> "DataFlow.DataFrame": return self def filter_on_column(self, column: str, value: Any, operator: Operator) -> "DataFlow.DataFrame": + """ + filters data on column + + :param column: str - column name + :param value: Any - value + :param operator: mysiar_data_flow.lib.Operator - filter operator + :return: self + """ if self.__in_memory: match operator: case Operator.Eq: From 7b1a4aa3ae77061078a2ab7dd64298e63e4c5f52 Mon Sep 17 00:00:00 2001 From: Piotr 
Synowiec Date: Mon, 21 Oct 2024 18:12:37 +0200 Subject: [PATCH 3/4] doc: version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 540603b..41747c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ license = {file = "LICENSE"} [tool.poetry] name = "mysiar-data-flow" -version = "0.0.2rc1" +version = "0.0.2" readme = "README.md" description = "Python data manipulation library" authors = ["Piotr Synowiec "] From 02457da32321ceea9bf7bf0cbd9ee68094757d41 Mon Sep 17 00:00:00 2001 From: Piotr Synowiec Date: Mon, 21 Oct 2024 18:16:23 +0200 Subject: [PATCH 4/4] doc: update changelog --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ef0dca..e7deb3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.0.2] - 2024-10-21 + +### Added + +- return type hints on methods that return self +- docstrings + ## [0.0.1] - 2024-10-16 ### Added + - initial version