From 9519d8a66d3e7d9cfba9920799dfaf3fc268efe5 Mon Sep 17 00:00:00 2001
From: Alec <30010253+alec-glisman@users.noreply.github.com>
Date: Wed, 10 Apr 2024 21:17:05 -0700
Subject: [PATCH 1/5] initial commit

---
 .gitignore                   |  1 +
 alec-glisman/ML-Band-Gaps.md | 31 +++++++++++++++
 alec-glisman/README.md       | 77 ++++++++++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+)
 create mode 100644 alec-glisman/ML-Band-Gaps.md
 create mode 100644 alec-glisman/README.md

diff --git a/.gitignore b/.gitignore
index 2d4daa40..c86efdde 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 .idea*
+**/.DS_Store
diff --git a/alec-glisman/ML-Band-Gaps.md b/alec-glisman/ML-Band-Gaps.md
new file mode 100644
index 00000000..2ccb5adc
--- /dev/null
+++ b/alec-glisman/ML-Band-Gaps.md
@@ -0,0 +1,31 @@
+# ML Band Gaps (Materials)
+
+> Ideal candidate: skilled ML data scientist with solid knowledge of materials science.
+
+# Overview
+
+The aim of this task is to create a Python package that implements automatic prediction of electronic band gaps for a set of materials based on training data.
+
+# User story
+
+As a user of this software, I can predict the value of an electronic band gap after passing training data and structural information about the target material.
+
+# Requirements
+
+- suggest the bandgap values for a set of materials designated by their crystallographic and stoichiometric properties
+- the code shall be written in a way that facilitates easy addition of other characteristics extracted from simulations (forces, pressures, phonon frequencies, etc.)
+
+# Expectations
+
+- the code shall be able to suggest realistic values for slightly modified geometry sets - e.g., trained on Si and Ge, it should suggest the value of bandgap for Si49Ge51 to be between those of Si and Ge
+- modular and object-oriented implementation
+- commit early and often - at least once per 24 hours
+
+# Timeline
+
+We leave exact timing to the candidate. Must fit within 5 days total.
+
+# Notes
+
+- use a designated GitHub repository for version control
+- suggested source of training data: materialsproject.org
diff --git a/alec-glisman/README.md b/alec-glisman/README.md
new file mode 100644
index 00000000..4ee50e61
--- /dev/null
+++ b/alec-glisman/README.md
@@ -0,0 +1,77 @@
+# ReWoTes
+
+REal WOrld TEstS
+
+## Overview
+
+This repository contains example test assignments used during the hiring process at Mat3ra.com. Regular job interview questions can often be misleading, so use the "real-world" examples instead.
+
+Each file represents an assignment similar to what one would get when hired.
+
+| Focus          | ReWote                     | Keywords                        |
+| ---------------| --------------------------| ------------------------------- |
+| Comp. Science  | [Convergence Tracker](Convergence-Tracker.md) | Python, OOD, DFT, Planewaves |
+| Comp. Science  | [Basis Set Selector](Basis-Set-Selector.md) | Python, OOD, DFT, Local-orbital |
+| Data. Science  | [ML Property Predict](ML-Band-Gaps.md) | Python, ML Models, Scikit, Featurization |
+| Front-End / UX | [Materials Designer](Materials-Designer.md) | ReactJS / UX Design, ThreeJS |
+| Front-End / UX | [Flowchart Designer](Flowchart-Designer.md) | ReactJS / UX Design, DAG |
+| Back-End / Ops | [Parallel Uploader](Parallel-File-Uploader.md) | Python, OOD, Threading, Objectstore |
+| CI/CD, DevOps  | [End-to-End Tests](End-to-End-Tests.md) | BDD tests, CI/CD workflows, Cypress |
+| HPC, Cloud Inf | [Cloud HPC Bench.](Cloud-Infrastructure.md) | HPC Cluster, Linpack, Benchmarks |
+| HPC, Containers| [Containerized HPC](Containerization-HPC.md) | HPC Cluster, Containers, Benchmarks |
+
+## Usage
+
+We suggest the following flow:
+
+1. [Fork](https://docs.github.com/en/free-pro-team@latest/github/getting-started-with-github/fork-a-repo) this repository on GitHub
+2. Create a branch using your GitHub username as a branch name
+3. Create a subfolder with your GitHub username
+4. Copy one of the ReWoTe suggestions (`.md` files) to `README.md` in that subfolder and modify the content of the ReWoTe as necessary
+5. Introduce any changes under the subfolder
+6. Submit a [pull request](https://docs.github.com/en/free-pro-team@latest/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request-from-a-fork) into the `dev` branch of this repository
+
+See [dev branch](https://github.com/Exabyte-io/rewotes/tree/dev) also.
+
+## Notes
+
+The examples listed here are only meant as guidelines and do not necessarily reflect the type of work to be performed at the company. Modifications to the individual assignments with advance notice are encouraged.
+
+We will screen for the ability to:
+
+1. pick up new concepts quickly
+2. explain one's thoughts and process
+3. implement a working proof-of-concept solution.
+
+We value attention to detail and modularity.
+
+## Who we look for
+
+We look for people who:
+
+1. Think like founders or want to be founders in the future, want to join a company at a very early stage, and understand the risk vs reward ratio.
+2. Can learn quickly and have demonstrated it in the past. Advanced degrees help with this, but are not a requirement.
+3. Know their $hit and have measurable ways to prove it. Here, we can, for example, look at GitHub profiles (and compare with https://github.com/timurbazhirov), a list of publications (and compare with https://scholar.google.com/citations?user=7SxfHbMAAAAJ), writing skills, and the ability to consume technical knowledge.
+4. Are honest and energetic.
+
+## Hiring process
+
+Our hiring process in more detail:
+
+| Stage             | Target Duration   | Topic                          |
+| ----------------- | ----------------- | ------------------------------ |
+| 0. Email screen   |                   | why mat3ra.com / exabyte.io    |
+| 1. Phone screen   | 15-20 min         | career goals, basic skillset   |
+| 2. ReWoTe         | 1-2h x 2-5 days   | real-world work/thought process|
+| 3. On-site meet   | 3-4 x 30 min      | personality fit                |
+| 4. Discuss offer  | 30 min            | cash/equity/benefits           |
+| 5. References     | 2 x 15 min        | sanity check                   |
+| 6. Decision       |                   | when to start                  |
+
+TOTAL: ~2 weeks tentative.
+
+## Contact info
+
+With any questions about this repository or our hiring process, please get in touch with us at info@mat3ra.com.
+
+© 2023 Exabyte Inc. / Mat3ra.com

From 9525aedf4e118d017c2eee4551ae369a765bc4e5 Mon Sep 17 00:00:00 2001
From: Alec <30010253+alec-glisman@users.noreply.github.com>
Date: Thu, 11 Apr 2024 19:07:14 -0700
Subject: [PATCH 2/5] first draft of data loading and preprocessing.
Added unit tests

---
 .gitignore                           |   6 +
 alec-glisman/README.md               |  81 ++------
 alec-glisman/main.py                 |  22 +++
 alec-glisman/requirements.yml        |  31 +++
 alec-glisman/src/data_load.py        | 282 +++++++++++++++++++++++++++
 alec-glisman/src/models.py           |  15 ++
 alec-glisman/tests/test_data_load.py |  82 ++++++++
 7 files changed, 454 insertions(+), 65 deletions(-)
 create mode 100644 alec-glisman/main.py
 create mode 100644 alec-glisman/requirements.yml
 create mode 100644 alec-glisman/src/data_load.py
 create mode 100644 alec-glisman/src/models.py
 create mode 100644 alec-glisman/tests/test_data_load.py

diff --git a/.gitignore b/.gitignore
index c86efdde..a1fb1675 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,8 @@
 .idea*
+
 **/.DS_Store
+*.pyc
+*.hdf5
+
+.vscode
+api_key.txt
diff --git a/alec-glisman/README.md b/alec-glisman/README.md
index 4ee50e61..004e17b0 100644
--- a/alec-glisman/README.md
+++ b/alec-glisman/README.md
@@ -1,77 +1,28 @@
-# ReWoTes
+# ReWoTes: ML Property Predict
 
-REal WOrld TEstS
+Alec Glisman
 
 ## Overview
 
-This repository contains example test assignments used during the hiring process at Mat3ra.com. Regular job interview questions can often be misleading, so use the "real-world" examples instead.
-
-Each file represents an assignment similar to what one would get when hired.
-
-| Focus          | ReWote                     | Keywords                        |
-| ---------------| --------------------------| ------------------------------- |
-| Comp. Science  | [Convergence Tracker](Convergence-Tracker.md) | Python, OOD, DFT, Planewaves |
-| Comp. Science  | [Basis Set Selector](Basis-Set-Selector.md) | Python, OOD, DFT, Local-orbital |
-| Data. Science  | [ML Property Predict](ML-Band-Gaps.md) | Python, ML Models, Scikit, Featurization |
-| Front-End / UX | [Materials Designer](Materials-Designer.md) | ReactJS / UX Design, ThreeJS |
-| Front-End / UX | [Flowchart Designer](Flowchart-Designer.md) | ReactJS / UX Design, DAG |
-| Back-End / Ops | [Parallel Uploader](Parallel-File-Uploader.md) | Python, OOD, Threading, Objectstore |
-| CI/CD, DevOps  | [End-to-End Tests](End-to-End-Tests.md) | BDD tests, CI/CD workflows, Cypress |
-| HPC, Cloud Inf | [Cloud HPC Bench.](Cloud-Infrastructure.md) | HPC Cluster, Linpack, Benchmarks |
-| HPC, Containers| [Containerized HPC](Containerization-HPC.md) | HPC Cluster, Containers, Benchmarks |
+This directory contains files for the ML Property Predict project for Mat3ra.com.
+
+Input data is accessed from the Materials Project and the data is cleaned into Pandas DataFrames inside `src/data_load.py`.
+The input data source to the machine learning model can be augmented with additional Materials Project data with the `MaterialData` init method and external data can also be merged using its respective `add_data_columns` method.
+The cleaned data is archived using Pandas in conjunction with HDF5 to lower runtime costs for model development.
 
 ## Usage
 
-We suggest the following flow:
-
-1. [Fork](https://docs.github.com/en/free-pro-team@latest/github/getting-started-with-github/fork-a-repo) this repository on GitHub
-2. Create a branch using your GitHub username as a branch name
-3. Create a subfolder with your GitHub username
-4. Copy one of the ReWoTe suggestions (`.md` files) to `README.md` in that subfolder and modify the content of the ReWoTe as necessary
-5. Introduce any changes under the subfolder
-6. Submit a [pull request](https://docs.github.com/en/free-pro-team@latest/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request-from-a-fork) into the `dev` branch of this repository
-
-See [dev branch](https://github.com/Exabyte-io/rewotes/tree/dev) also.
-
-## Notes
-
-The examples listed here are only meant as guidelines and do not necessarily reflect the type of work to be performed at the company. Modifications to the individual assignments with advance notice are encouraged.
-
-We will screen for the ability to:
-
-1. pick up new concepts quickly
-2. explain one's thoughts and process
-3. implement a working proof-of-concept solution.
-
-We value attention to detail and modularity.
-
-## Who we look for
-
-We look for people who:
-
-1. Think like founders or want to be founders in the future, want to join a company at a very early stage, and understand the risk vs reward ratio.
-2. Can learn quickly and have demonstrated it in the past. Advanced degrees help with this, but are not a requirement.
-3. Know their $hit and have measurable ways to prove it. Here, we can, for example, look at GitHub profiles (and compare with https://github.com/timurbazhirov), a list of publications (and compare with https://scholar.google.com/citations?user=7SxfHbMAAAAJ), writing skills, and the ability to consume technical knowledge.
-4. Are honest and energetic.
-
-## Hiring process
-
-Our hiring process in more detail:
-
-| Stage             | Target Duration   | Topic                          |
-| ----------------- | ----------------- | ------------------------------ |
-| 0. Email screen   |                   | why mat3ra.com / exabyte.io    |
-| 1. Phone screen   | 15-20 min         | career goals, basic skillset   |
-| 2. ReWoTe         | 1-2h x 2-5 days   | real-world work/thought process|
-| 3. On-site meet   | 3-4 x 30 min      | personality fit                |
-| 4. Discuss offer  | 30 min            | cash/equity/benefits           |
-| 5. References     | 2 x 15 min        | sanity check                   |
-| 6. Decision       |                   | when to start                  |
+A Conda environment file has been provided (`requirements.yml`) to set up a Python environment called `ml-band-gaps` with the following command
 
-TOTAL: ~2 weeks tentative.
+```[bash]
+conda env create -f requirements.yml
+```
 
-## Contact info
+The overall project can then be run with
 
-With any questions about this repository or our hiring process, please get in touch with us at info@mat3ra.com.
+```[bash]
+python3 main.py
+```
 
-© 2023 Exabyte Inc. / Mat3ra.com
+Note that the data is sourced from the Materials Project, which requires an API key to access it.
+Users will need to generate their own key and save it to an `api_key.txt` file in this directory; that file is listed in `.gitignore` so that no key is ever committed.
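+
+For programmatic use, the loading step can also be driven directly from Python. A minimal sketch mirroring `main.py` (the band-gap filter shown is illustrative):
+
+```[python]
+from pathlib import Path
+
+from src.data_load import MaterialData
+
+# read the Materials Project API key (kept out of version control)
+api_key = Path("api_key.txt").read_text(encoding="utf-8").strip()
+
+# fetch and clean materials in a chosen band-gap window, then split
+data = MaterialData(api_key, band_gap=(0.0, 1000.0))
+x_train, x_test, y_train, y_test, mpid = data.split_data()
+```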
diff --git a/alec-glisman/main.py b/alec-glisman/main.py new file mode 100644 index 00000000..52d749cd --- /dev/null +++ b/alec-glisman/main.py @@ -0,0 +1,22 @@ +from pathlib import Path +from src.data_load import MaterialData + + +def main() -> None: + file_path = Path(__file__).resolve().parent + + # API key is not included in the code for security reasons + with open(file_path / "api_key.txt", "r", encoding="utf-8") as f: + api_key = f.read().strip() + + # Load data + data = MaterialData(api_key, band_gap=(0.0, 1000.0)) + x_train, x_test, y_train, y_test, _ = data.split_data() + + # Train models + + print("complete!") + + +if __name__ == "__main__": + main() diff --git a/alec-glisman/requirements.yml b/alec-glisman/requirements.yml new file mode 100644 index 00000000..b874f83e --- /dev/null +++ b/alec-glisman/requirements.yml @@ -0,0 +1,31 @@ +name: ml-band-gaps +channels: + - conda-forge +dependencies: + - pip + - tqdm + - numpy + - pandas + - pytables + - scipy + - scikit-learn + - xgboost + - matplotlib + - pymatgen + - phonopy + - ipykernel + - ipywidgets + - ipympl + - pandoc + - notebook + - jupyter_client + - pytest + - pytest-cov + - pytest-xdist + - coverage + - autopep8 + - black + - flake8 + - pip: + - "--editable=git+https://github.com/materialsproject/api.git@main#egg=mp-api" + \ No newline at end of file diff --git a/alec-glisman/src/data_load.py b/alec-glisman/src/data_load.py new file mode 100644 index 00000000..6b235ab7 --- /dev/null +++ b/alec-glisman/src/data_load.py @@ -0,0 +1,282 @@ +""" +This module provides a class for loading and processing material data from the +Materials Project API. + +The `MaterialData` class allows users to retrieve material data from the +Materials Project API, process and clean the data, and perform operations +such as splitting the data into train and test sets. + +Example: + ```python + from pathlib import Path + from src.data_load import MaterialData + + # load all materials with band gap between 0 and 1000 eV + data = MaterialData(api_key, band_gap=(0.0, 1000.0)) + data.get_data() + + # split data into train and test sets for band gap prediction + x_train, x_test, y_train, y_test, _ = data.split_data() + ``` + +Classes: + MaterialData: A class for loading and processing material data from the + Materials Project API. +""" + +from pathlib import Path + +from mp_api.client import MPRester +import pandas as pd +from sklearn.model_selection import train_test_split + + +class MaterialData: + """A class for loading and processing material data from the Materials + Project API. + + Extra fields from the Materials Project can be added with the `fields` + parameter in the constructor. + External data for each material can be added with the `add_data_columns` + method. + + Args: + api_key (str): The API key for accessing the material data. + fields (list, optional): The list of fields to retrieve from the + material data. Defaults to None. + save (bool, optional): Whether to save the loaded data. Defaults to + True. + **kwargs: Additional keyword arguments to be passed to the material + data API. + + Attributes: + api_key (str): The API key for accessing the material data. + fields (list): The list of fields to retrieve from the material data. + save (bool): Whether to save the loaded data. + kwargs (dict): Additional keyword arguments to be passed to the + material data API. + materials (list): The loaded material data. + dataframe (pd.DataFrame): The processed material data. + _dir_output (Path): The output directory for saving the data. 
+ _file_data (Path): The file path for saving the data. + + Methods: + __init__: Initializes the MaterialData object. + __repr__: Returns a string representation of the MaterialData object. + __len__: Returns the number of rows in the material data. + _fetch_materials: Fetches the material data from the API. + get_materials: Returns the loaded material data. + get_data: Returns the processed material data. + split_data: Splits the material data into train and test sets. + add_data_columns: Adds additional columns to the material data. + _extract_data: Extracts and cleans the material data. + _encode_data: Encodes the categorical columns in the material data. + """ + + def __init__(self, api_key: str, fields: list = None, save: bool = True, **kwargs): + """ + Initialize the DataLoad object. + + Parameters: + - api_key (str): The API key for accessing the data. + - fields (list): The list of fields to retrieve from the data. + Defaults to a predefined list of fields. + - save (bool): Flag indicating whether to save the data. + Defaults to True. + - **kwargs: Additional keyword arguments. + + Raises: + - ValueError: If the API key is not provided. + """ + self.api_key: str = api_key + self.fields: list[str] = fields or [ + "material_id", + "composition_reduced", + "symmetry", + "structure", + "band_gap", + ] + self.save: bool = save + self.kwargs: dict = kwargs + + if not api_key: + raise ValueError("API key must be provided") + + self.materials: list = None + self.dataframe: pd.DataFrame = None + + self._dir_output: Path = Path("./data") + self._file_data: Path = self._dir_output / "materials_data.hdf5" + + def __repr__(self) -> str: + """Return a string representation of the MaterialData object. + + Returns: + str: A string representation of the MaterialData object. + """ + return ( + f"MaterialData(api_key={self.api_key}, fields={self.fields}" + + f", kwargs={self.kwargs})" + ) + + def __len__(self) -> int: + """Return the number of rows in the material data frame. + + Returns: + int: The number of rows in the material data frame. + """ + return len(self.dataframe) if self.dataframe is not None else 0 + + def _fetch_materials(self) -> None: + """Retrieve the material data from the Materials Project API.""" + with MPRester(self.api_key) as mpr: + self.materials = mpr.materials.summary.search( + fields=self.fields, **self.kwargs + ) + + def get_materials(self) -> list: + """Return the loaded Material Project API data. + + If the data has not been loaded, it will be fetched from the API. + + Returns: + list: Material Project data for each material. + """ + if self.materials is None: + self._fetch_materials() + return self.materials + + def get_data(self) -> pd.DataFrame: + """Return the processed and cleaned material data. + + If the data has been cached, it will be loaded from the file. + Otherwise, the data will be fetched from the API, cleaned, and + saved to the file. + + Returns: + pd.DataFrame: Material data + """ + + # load data if it exists + if self.dataframe is None and self._file_data.exists(): + self.dataframe = pd.read_hdf(self._file_data, key="data") + elif self.dataframe is None: + self._extract_data() + self._encode_data() + + if self.save: + self._dir_output.mkdir(exist_ok=True, parents=True) + self.dataframe.to_hdf(self._file_data, key="data", mode="w") + + return self.dataframe + + def split_data( + self, target: str = "band_gap", test_size: float = 0.2, seed: int = 42 + ) -> tuple: + """Split the material data into train and test sets. 
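+
+        The split is a single shuffled random split; no stratification
+        (e.g., by crystal system) is applied, so rare categories may be
+        unevenly represented between the two sets.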
+ + Parameters: + - target (str): The target column for prediction. Defaults to + "band_gap". + - test_size (float): The proportion of the data to include in the test + set. Defaults to 0.2. + - seed (int): The random seed for splitting the data. Defaults to 42. + + Returns: + - tuple: A tuple containing the train and test sets of the input + features and the target variable, as well as the material IDs. + """ + if self.dataframe is None: + self.get_data() + + # extract ID for later use + mpid = self.dataframe["id"] + + # test/train split + x = self.dataframe.drop(columns=[target, "id"]) + y = self.dataframe[target] + x_train, x_test, y_train, y_test = train_test_split( + x, y, test_size=test_size, random_state=seed + ) + + return x_train, x_test, y_train, y_test, mpid + + def add_data_columns(self, data: dict) -> None: + """Add additional columns to the material data. + + Note that the data will be added to the existing data frame and it is + assumed that the data is already encoded if necessary. + + Parameters: + - data (dict): A dictionary of additional columns to add to the data. + """ + if self.dataframe is None: + self._extract_data() + self._encode_data() + + self.dataframe = self.dataframe.assign(**data) + + if self.save: + self.dataframe.to_hdf(self._file_data, key="data", mode="w") + + def _extract_data(self) -> pd.DataFrame: + """Extract and clean the material data from the API into a DataFrame + for analysis. + + Returns: + pd.DataFrame: The cleaned material data. + """ + if self.materials is None: + self._fetch_materials() + + cleaned_data = [] + for doc in self.materials: + # extract subset of symmetry data + keys = ["crystal_system", "symbol", "point_group"] + d = doc.symmetry.dict() + symmetry = dict((k, d[k]) for k in keys) + symmetry["crystal_system"] = symmetry["crystal_system"].value + + # extract subset of structure data + lattice = doc.structure.lattice + structure = { + "a": lattice.a, + "b": lattice.b, + "c": lattice.c, + "alpha": lattice.alpha, + "beta": lattice.beta, + "gamma": lattice.gamma, + "density": doc.structure.density, + } + + # combine dicts + data = { + **{"id": doc.material_id.split("()")[0]}, + **doc.composition_reduced.as_dict(), + **symmetry, + **structure, + **{"band_gap": doc.band_gap}, + } + cleaned_data.append(data) + + # convert list of dicts to pandas, and fill missing values in elements + self.dataframe = pd.DataFrame(cleaned_data).fillna(0) + return self.dataframe + + def _encode_data(self) -> pd.DataFrame: + """Encode the categorical columns in the material data. + + Returns: + pd.DataFrame: The encoded material data. 
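+
+        Example:
+            One-hot encoding expands each categorical column into
+            indicator columns, e.g. (the indicator dtype varies with the
+            pandas version):
+
+            ```python
+            pd.get_dummies(
+                pd.DataFrame({"crystal_system": ["Cubic", "Hexagonal"]}),
+                columns=["crystal_system"],
+                drop_first=True,
+            )
+            #    crystal_system_Hexagonal
+            # 0                     False
+            # 1                      True
+            ```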
+ """ + if self.dataframe is None: + self._extract_data() + + # one-hot encoding for categorical columns + self.dataframe = pd.get_dummies( + self.dataframe, + columns=["crystal_system", "point_group", "symbol"], + drop_first=True, + ) + + return self.dataframe diff --git a/alec-glisman/src/models.py b/alec-glisman/src/models.py new file mode 100644 index 00000000..145a052d --- /dev/null +++ b/alec-glisman/src/models.py @@ -0,0 +1,15 @@ +import numpy as np + +from scipy.stats import uniform, randint + +from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine +from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error +from sklearn.model_selection import ( + cross_val_score, + GridSearchCV, + KFold, + RandomizedSearchCV, + train_test_split, +) + +import xgboost as xgb diff --git a/alec-glisman/tests/test_data_load.py b/alec-glisman/tests/test_data_load.py new file mode 100644 index 00000000..e8f1b0f5 --- /dev/null +++ b/alec-glisman/tests/test_data_load.py @@ -0,0 +1,82 @@ +"""Test the MaterialData class + +This module contains tests for the MaterialData class in the data +loading module. + +Example: + To test the MaterialData class, run the following command: + + $ pytest tests/test_data_load.py +""" + +import sys +from pathlib import Path +import pytest + +import pandas as pd + +sys.path.append(str(Path(__file__).resolve().parents[1])) +from src.data_load import MaterialData # noqa: E402 + + +@pytest.fixture(scope="module") +def api_key() -> str: + """Read the API key from a file""" + with open("api_key.txt", "r", encoding="utf-8") as f: + api = f.read().strip() + return api + + +@pytest.fixture(scope="module") +def data(api_key: str) -> MaterialData: + """Return a MaterialData object""" + return MaterialData(api_key, save=False, band_gap=(0.5, 0.55)) + + +class TestMaterialData: + def test_empty_init(self) -> None: + """Expect a ValueError when no arguments are passed""" + with pytest.raises(TypeError): + MaterialData() + + def test_bad_init(self) -> None: + """Expect a ValueError when an invalid argument is passed""" + with pytest.raises(ValueError): + MaterialData(24512) + + def test_init(self, data: MaterialData) -> None: + """Expect a MaterialData object to be created""" + assert data is not None + + # Check that the attributes are set correctly + assert data.materials is None + assert data.dataframe is None + assert len(data) == 0 + + def test_repr(self, data: MaterialData) -> None: + """Expect the __repr__ method to return a string""" + assert isinstance(repr(data), str) + + def test_get_materials(self, data: MaterialData) -> None: + """Expect the material data to be fetched, but not saved""" + materials = data.get_materials() + + # check that the materials are fetched + assert materials is not None + assert data.materials is not None + assert len(materials) > 0 + + def test_get_data(self, data: MaterialData) -> None: + """Expect the material data to be fetched, cleaned, and not saved""" + data._file_data = Path("temp/materials_data.hdf5") + df = data.get_data() + + # check that the data is fetched and cleaned + assert df is not None + assert data.dataframe is not None + assert len(data) > 0 + assert len(data) == len(df) + assert isinstance(data.dataframe, pd.DataFrame) + + # check that the output file is not created + assert not data._file_data.exists() From 3bda902b2a1d68238475be6f692fd1766326c98b Mon Sep 17 00:00:00 2001 From: Alec <30010253+alec-glisman@users.noreply.github.com> Date: Thu, 11 Apr 2024 21:00:43 -0700 Subject: [PATCH 
3/5] XGBoost models stable and accurate --- .gitignore | 4 +- alec-glisman/README.md | 34 +++++- alec-glisman/main.py | 40 ++++++- alec-glisman/src/data_load.py | 4 +- alec-glisman/src/models.py | 212 ++++++++++++++++++++++++++++++++-- 5 files changed, 275 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index a1fb1675..821015bd 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,9 @@ **/.DS_Store *.pyc -*.hdf5 + +**/data/* +**/models/* .vscode api_key.txt diff --git a/alec-glisman/README.md b/alec-glisman/README.md index 004e17b0..0e2f9415 100644 --- a/alec-glisman/README.md +++ b/alec-glisman/README.md @@ -10,19 +10,49 @@ Input data is accessed from the Materials Project and the data is cleaned into P The input data source to the machine learning model can be augmented with additional Materials Project data with the `MaterialData` init method and external data can also be merged using its respective `add_data_columns` method. The cleaned data is archived using Pandas in conjunction with HDF5 to lower runtime costs for model development. +The best XGBoost Regressor that I trained is saved during runtime under the `models` directory and has an MSE of 0.700 eV. +The seed used is provided in `main.py` for reproducibility. + +Areas for future work include: + +1. Stratified sampling for test/train split or cross-validation to make sure different space groups are represented properly in each subset. +2. Explore the use of feed-forward neural networks and experiment with architecture/drop-out to optimize the performance. +3. Addition of more data from the Materials Project to lower the inductive bias of the models. +4. Attempt transfer-learning of these models and fine-tune to more specific databases, such as silicon semiconductors. + ## Usage A Conda environment file has been provided (`requirements.yml`) to set up a Python environment called `ml-band-gaps` with the following command ```[bash] -conda env create -f requirements.yml +$ conda env create -f requirements.yml ``` The overall project can then be run with ```[bash] -python3 main.py +$ python3 main.py +``` + +Unit tests can be run with pytest as + +```[bash] +$ pytest tests ``` +Data ingested is cached to the `data` directory, and machine-learning models are cached to the `models` directory. +Each of these directories is created automatically as part of the main script. + Note that the data is sourced from the Materials Project, which requires an API key to access it. I have added my API key to the `.gitignore` for security reasons, so users will need to generate their own and add it to an `api_key.txt` file. + +## Requirements + +- suggest the bandgap values for a set of materials designated by their crystallographic and stoichiometric properties +- the code shall be written in a way that can facilitate easy addition of other characteristics extracted from simulations (forces, pressures, phonon frequencies etc.) + +## Expectations + +- the code shall be able to suggest realistic values for slightly modified geometry sets - e.g. trained on Si and Ge it should suggest the value of bandgap for Si49Ge51 to be between those of Si and Ge +- modular and object-oriented implementation +- commit early and often - at least once per 24 hours diff --git a/alec-glisman/main.py b/alec-glisman/main.py index 52d749cd..4196c4f9 100644 --- a/alec-glisman/main.py +++ b/alec-glisman/main.py @@ -1,21 +1,55 @@ +"""Main script that trains models using the XGBoostModels class. 
+ +The main function in this script is `main()`, which is responsible for +executing the script. It follows the steps mentioned above and does not +return any value. + +To run this script, execute the `main()` function. + +Example: + python main.py + +Note: Before running the script, make sure to provide the API key in a file +named "api_key.txt" located in the same directory as this script. +""" + from pathlib import Path + from src.data_load import MaterialData +from src.models import XGBoostModels def main() -> None: + """ + Main function that executes the script. + + This function performs the following steps: + 1. Reads the API key from a file. + 2. Loads data using the MaterialData class. + 3. Splits the data into training and testing sets. + 4. Trains models using the XGBoostModels class. + 5. Prints a completion message. + + Returns: + None + """ file_path = Path(__file__).resolve().parent + seed = 42 # API key is not included in the code for security reasons with open(file_path / "api_key.txt", "r", encoding="utf-8") as f: api_key = f.read().strip() # Load data - data = MaterialData(api_key, band_gap=(0.0, 1000.0)) - x_train, x_test, y_train, y_test, _ = data.split_data() + data = MaterialData(api_key, band_gap=(0.0, 10.0)) + x_train, x_test, y_train, y_test, _ = data.split_data(seed=seed) # Train models + xgb = XGBoostModels(x_train, y_train, x_test, y_test, save=True) + xgb.train_models(seed=seed) - print("complete!") + # Notify user that the script has finished + print("Script completed successfully.") if __name__ == "__main__": diff --git a/alec-glisman/src/data_load.py b/alec-glisman/src/data_load.py index 6b235ab7..5d1f52ac 100644 --- a/alec-glisman/src/data_load.py +++ b/alec-glisman/src/data_load.py @@ -99,7 +99,7 @@ def __init__(self, api_key: str, fields: list = None, save: bool = True, **kwarg self.save: bool = save self.kwargs: dict = kwargs - if not api_key: + if not isinstance(self.api_key, str) or not self.api_key: raise ValueError("API key must be provided") self.materials: list = None @@ -196,7 +196,7 @@ def split_data( x = self.dataframe.drop(columns=[target, "id"]) y = self.dataframe[target] x_train, x_test, y_train, y_test = train_test_split( - x, y, test_size=test_size, random_state=seed + x, y, test_size=test_size, random_state=seed, shuffle=True ) return x_train, x_test, y_train, y_test, mpid diff --git a/alec-glisman/src/models.py b/alec-glisman/src/models.py index 145a052d..8718820d 100644 --- a/alec-glisman/src/models.py +++ b/alec-glisman/src/models.py @@ -1,15 +1,205 @@ -import numpy as np +from pathlib import Path +import numpy as np +import pandas as pd from scipy.stats import uniform, randint - -from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine -from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error -from sklearn.model_selection import ( - cross_val_score, - GridSearchCV, - KFold, - RandomizedSearchCV, - train_test_split, +from sklearn.metrics import ( + auc, + accuracy_score, + confusion_matrix, + mean_squared_error, ) - +from sklearn.model_selection import KFold, RandomizedSearchCV import xgboost as xgb + + +class MaterialsModels: + """A class for training and evaluating materials models. + + This class provides a template for training and evaluating machine learning + models on materials data. It includes methods for training models with + cross-validation and randomized search for hyperparameter tuning, as well + as evaluating the trained models. 
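+
+    Subclasses are expected to override `train_models` (the base
+    implementation raises NotImplementedError); `evaluate_model` is
+    shared and reads the fitted estimator from the `model` attribute.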
+ + Args: + x_train (np.ndarray): The training features. + y_train (np.ndarray): The training labels. + x_test (np.ndarray): The testing features. + y_test (np.ndarray): The testing labels. + save (bool, optional): Whether to save the trained model and metrics. + Defaults to True. + + Attributes: + x_train (np.ndarray): The training features. + y_train (np.ndarray): The training labels. + x_test (np.ndarray): The testing features. + y_test (np.ndarray): The testing labels. + save (bool): Whether to save the trained model and metrics. + metrics (dict): The evaluation metrics. + model: The trained model. + _dir_out (Path): The output directory for saving the model and metrics. + + Raises: + NotImplementedError: If the train_models method is not implemented. + + """ + + def __init__( + self, + x_train: np.ndarray, + y_train: np.ndarray, + x_test: np.ndarray, + y_test: np.ndarray, + save: bool = True, + ) -> None: + self.x_train = x_train + self.y_train = y_train + self.x_test = x_test + self.y_test = y_test + self.save = save + + self.metrics = None + self.model = None + + self._dir_out = Path("./models") + + def train_models( + self, param_grid: dict = None, cv: int = 10, seed: int = 42 + ) -> dict: + """Train XGBoost models with cross-validation and grid search. + + Args: + param_grid (dict): The parameter grid for the grid search. + Defaults to None. + cv (int, optional): The number of cross-validation folds. + Defaults to 5. + seed (int, optional): The random seed for the train-test split. + Defaults to 42. + + Returns: + dict: The trained models. + + Raises: + NotImplementedError: If the method is not implemented. + """ + raise NotImplementedError("Method not implemented") + + def evaluate_model(self) -> dict: + """Evaluate the trained model. + + Returns: + dict: The evaluation metrics. + """ + y_pred = self.model.predict(self.x_test) + accuracy = accuracy_score(self.y_test, y_pred) + confusion = confusion_matrix(self.y_test, y_pred) + mse = mean_squared_error(self.y_test, y_pred) + aucp = auc(self.y_test, y_pred) + + self.metrics = { + "accuracy": accuracy, + "confusion_matrix": confusion, + "mean_squared_error": mse, + "root_mean_squared_error": np.sqrt(mse), + "auc": aucp, + } + return self.metrics + + +class XGBoostModels(MaterialsModels): + """ + This class represents a set of XGBoost models for materials data. + + It inherits from the MaterialsModels base class and implements the + train_models method specifically for XGBoost models. It uses + cross-validation and grid search for training. + + Attributes: + model: The trained model. + metrics: The evaluation metrics for the model. + x_train: The training data. + y_train: The training labels. + x_test: The test data. + y_test: The test labels. + """ + + def train_models( + self, param_grid: dict = None, cv: int = 10, seed: int = 42 + ) -> xgb.XGBRegressor: + """Train XGBoost models with cross-validation and grid search. + + Args: + param_grid (dict): The parameter grid for the grid search. + Defaults to None. + cv (int, optional): The number of cross-validation folds. + Defaults to 5. + seed (int, optional): The random seed for the train-test split. + Defaults to 42. + + Returns: + xgb.XGBRegressor: The best trained XGBoost model. 
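+
+        Example:
+            A minimal sketch, assuming arrays prepared by
+            `MaterialData.split_data` (the single-entry grid shown is
+            illustrative, not a tuned choice):
+
+                models = XGBoostModels(x_train, y_train, x_test, y_test)
+                best = models.train_models(
+                    param_grid={"max_depth": randint(3, 10)}, seed=42
+                )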
+ """ + if param_grid is None: + param_grid = { + "n_estimators": randint(100, 1000), + "max_depth": randint(3, 10), + "subsample": uniform(0.5, 0.5), + "colsample_bytree": uniform(0.5, 0.5), + } + + model = xgb.XGBRegressor( + n_estimators=1000, + max_depth=7, + learning_rate=0.1, + colsample_bytree=0.8, + subsample=0.8, + n_jobs=1, + ) + kfold = KFold(n_splits=cv, random_state=seed, shuffle=True) + random_search = RandomizedSearchCV( + model, + param_distributions=param_grid, + n_iter=10, + scoring="neg_mean_squared_error", + n_jobs=-1, + cv=kfold.split(self.x_train, self.y_train), + verbose=3, + random_state=seed, + ) + random_search.fit(self.x_train, self.y_train) + best_model = random_search.best_estimator_ + self.model = best_model + + params = random_search.best_params_ + print(f"Best Model:\n{best_model}") + + # train the best model on the full training set + best_model.fit(self.x_train, self.y_train) + y_pred = best_model.predict(self.x_test) + mse = mean_squared_error(self.y_test, y_pred) + print(f"Mean Squared Error: {mse}") + + if self.save: + self._dir_out.mkdir(exist_ok=True) + best_model.save_model(self._dir_out / "xgboost_model.json") + pd.DataFrame(params, index=[0]).to_csv( + self._dir_out / "xgboost_params.csv", index=False + ) + + return best_model + + def evaluate_model(self) -> dict: + """Evaluate the trained model. + + Returns: + dict: The evaluation metrics. + """ + super().evaluate_model() + + if self.save: + self._dir_out.mkdir(exist_ok=True) + pd.DataFrame(self.metrics, index=[0]).to_csv( + self._dir_out / "xgboost_metrics.csv", index=False + ) + + return self.metrics From a5477157fee414221da8a28196fa156688bf918a Mon Sep 17 00:00:00 2001 From: Alec <30010253+alec-glisman@users.noreply.github.com> Date: Thu, 11 Apr 2024 21:57:37 -0700 Subject: [PATCH 4/5] neural networks stable and training --- alec-glisman/main.py | 9 +- alec-glisman/requirements.yml | 4 + alec-glisman/src/data_load.py | 24 +++- alec-glisman/src/models.py | 208 +++++++++++++++++++++++++++++++++- 4 files changed, 236 insertions(+), 9 deletions(-) diff --git a/alec-glisman/main.py b/alec-glisman/main.py index 4196c4f9..ae7ecba1 100644 --- a/alec-glisman/main.py +++ b/alec-glisman/main.py @@ -16,7 +16,7 @@ from pathlib import Path from src.data_load import MaterialData -from src.models import XGBoostModels +from src.models import XGBoostModels, NeuralNetModels def main() -> None: @@ -28,7 +28,8 @@ def main() -> None: 2. Loads data using the MaterialData class. 3. Splits the data into training and testing sets. 4. Trains models using the XGBoostModels class. - 5. Prints a completion message. + 5. Trains models using a Neural Network. + 6. Prints a completion message. 
Returns: None @@ -42,11 +43,13 @@ def main() -> None: # Load data data = MaterialData(api_key, band_gap=(0.0, 10.0)) - x_train, x_test, y_train, y_test, _ = data.split_data(seed=seed) + x_train, x_test, y_train, y_test, _, _ = data.split_data(seed=seed) # Train models xgb = XGBoostModels(x_train, y_train, x_test, y_test, save=True) xgb.train_models(seed=seed) + nn = NeuralNetModels(x_train, y_train, x_test, y_test, save=True) + nn.train_models(seed=seed) # Notify user that the script has finished print("Script completed successfully.") diff --git a/alec-glisman/requirements.yml b/alec-glisman/requirements.yml index b874f83e..367fc381 100644 --- a/alec-glisman/requirements.yml +++ b/alec-glisman/requirements.yml @@ -4,12 +4,16 @@ channels: dependencies: - pip - tqdm + - joblib - numpy - pandas - pytables - scipy - scikit-learn - xgboost + - pytorch + - torchvision + - skorch - matplotlib - pymatgen - phonopy diff --git a/alec-glisman/src/data_load.py b/alec-glisman/src/data_load.py index 5d1f52ac..5e4e36d3 100644 --- a/alec-glisman/src/data_load.py +++ b/alec-glisman/src/data_load.py @@ -26,9 +26,12 @@ from pathlib import Path -from mp_api.client import MPRester +import joblib import pandas as pd from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler + +from mp_api.client import MPRester class MaterialData: @@ -184,7 +187,8 @@ def split_data( Returns: - tuple: A tuple containing the train and test sets of the input - features and the target variable, as well as the material IDs. + features and the target variable, as well as the material IDs, and the + scaler used to scale the data. """ if self.dataframe is None: self.get_data() @@ -193,13 +197,23 @@ def split_data( mpid = self.dataframe["id"] # test/train split - x = self.dataframe.drop(columns=[target, "id"]) - y = self.dataframe[target] + x = self.dataframe.drop(columns=[target, "id"]).to_numpy() + y = self.dataframe[target].to_numpy().reshape(-1, 1) x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=test_size, random_state=seed, shuffle=True ) - return x_train, x_test, y_train, y_test, mpid + # scale the training and testing data + scaler = StandardScaler() + x_train = scaler.fit_transform(x_train) + x_test = scaler.transform(x_test) + + # save the scaler + if self.save: + scaler_filename = self._dir_output / "scaler.save" + joblib.dump(scaler, scaler_filename) + + return x_train, x_test, y_train, y_test, mpid, scaler def add_data_columns(self, data: dict) -> None: """Add additional columns to the material data. diff --git a/alec-glisman/src/models.py b/alec-glisman/src/models.py index 8718820d..f3ad755a 100644 --- a/alec-glisman/src/models.py +++ b/alec-glisman/src/models.py @@ -10,6 +10,9 @@ mean_squared_error, ) from sklearn.model_selection import KFold, RandomizedSearchCV +from skorch import NeuralNetRegressor +import torch +from torch import nn import xgboost as xgb @@ -52,6 +55,17 @@ def __init__( y_test: np.ndarray, save: bool = True, ) -> None: + """Initialize the MaterialsModels class. + + Args: + x_train (np.ndarray): The training features. + y_train (np.ndarray): The training labels. + x_test (np.ndarray): The testing features. + y_test (np.ndarray): The testing labels. + save (bool, optional): Whether to save the trained model and + metrics. Defaults to True. + """ + self.x_train = x_train self.y_train = y_train self.x_test = x_test @@ -132,7 +146,7 @@ def train_models( param_grid (dict): The parameter grid for the grid search. Defaults to None. 
cv (int, optional): The number of cross-validation folds. - Defaults to 5. + Defaults to 10. seed (int, optional): The random seed for the train-test split. Defaults to 42. @@ -203,3 +217,195 @@ def evaluate_model(self) -> dict: ) return self.metrics + + +class NeuralNetModels(MaterialsModels): + """ + This class represents a set of PyTorch models for materials data. + + It inherits from the MaterialsModels base class and implements the + train_models method specifically for PyTorch models. It uses + cross-validation and grid search for training. + + Attributes: + model: The trained model. + metrics: The evaluation metrics for the model. + x_train: The training data. + y_train: The training labels. + x_test: The test data. + y_test: The test labels. + """ + + class Net(nn.Module): + """A simple feedforward neural network. + + Args: + nn (Module): The PyTorch neural network module. + """ + + def __init__( + self, + input_size, + hidden_size, + output_size, + num_layers, + dropout, + activation=nn.ReLU(), + ): + """Initialize the neural network. + + Args: + input_size (int): The size of the input layer. + hidden_size (int): The size of the hidden layers. + output_size (int): The size of the output layer. + num_layers (int): The number of hidden layers. + dropout (float): The dropout rate. + activation (nn.Module, optional): The activation function. + Defaults to nn.ReLU(). + """ + + super(NeuralNetModels.Net, self).__init__() + # make a list of hidden layers + layers = [] + # input layer + layers.append(nn.Linear(input_size, hidden_size)) + layers.append(activation) + layers.append(nn.Dropout(dropout)) + # hidden layers + for _ in range(1, num_layers): + layers.append(nn.Linear(hidden_size, hidden_size)) + layers.append(activation) + layers.append(nn.Dropout(dropout)) + # output layer + layers.append(nn.Linear(hidden_size, output_size)) + self.net = nn.Sequential(*layers) + + def forward(self, x): + """Forward pass of the neural network. + + Args: + x (torch.Tensor): The input data. + """ + return self.net(x) + + def __init__( + self, + x_train: np.ndarray, + y_train: np.ndarray, + x_test: np.ndarray, + y_test: np.ndarray, + save: bool = True, + ) -> None: + """Initialize the MaterialsModels class. + + Args: + x_train (np.ndarray): The training features. + y_train (np.ndarray): The training labels. + x_test (np.ndarray): The testing features. + y_test (np.ndarray): The testing labels. + save (bool, optional): Whether to save the trained model and + metrics. Defaults to True. + """ + super().__init__(x_train, y_train, x_test, y_test, save) + + # convert x and y to torch tensors of type float + self.x_train = torch.tensor(x_train, dtype=torch.float32) + self.y_train = torch.tensor(y_train, dtype=torch.float32) + self.x_test = torch.tensor(x_test, dtype=torch.float32) + self.y_test = torch.tensor(y_test, dtype=torch.float32) + + def train_models( + self, + param_grid: dict = None, + cv: int = 3, + seed: int = 42, + epochs: int = 50, + ) -> NeuralNetRegressor: + """Train PyTorch models with cross-validation and grid search. + + Args: + param_grid (dict): The parameter grid for the grid search. + Defaults to None. + cv (int, optional): The number of cross-validation folds. + Defaults to 3. + seed (int, optional): The random seed for the train-test split. + Defaults to 42. + epochs (int, optional): The number of training epochs. + Defaults to 50. + + Returns: + NeuralNetRegressor: The best trained PyTorch model. 
+ """ + if param_grid is None: + param_grid = { + "module__hidden_size": randint(20, 250), + "module__num_layers": randint(4, 7), + "module__dropout": uniform(0.0, 0.2), + "lr": uniform(0.001, 0.1), + } + input_size = self.x_train.shape[1] + output_size = self.y_train.shape[1] + + torch.manual_seed(seed) + net = NeuralNetRegressor( + module=self.Net, + module__input_size=input_size, + module__hidden_size=15, + module__output_size=output_size, + module__num_layers=4, + module__dropout=0.01, + max_epochs=epochs, + lr=0.1, + optimizer=torch.optim.SGD, + ) + + kfold = KFold(n_splits=cv, random_state=seed, shuffle=True) + random_search = RandomizedSearchCV( + net, + param_distributions=param_grid, + n_iter=2, + scoring="neg_mean_squared_error", + n_jobs=-1, + cv=kfold.split(self.x_train, self.y_train), + verbose=3, + random_state=seed, + ) + random_search.fit(self.x_train, self.y_train) + best_model = random_search.best_estimator_ + self.model = best_model + + params = random_search.best_params_ + print(f"Best Model:\n{best_model}") + + # train the best model on the full training set + best_model.fit(self.x_train, self.y_train) + y_pred = best_model.predict(self.x_test) + mse = mean_squared_error(self.y_test, y_pred) + print(f"Mean Squared Error: {mse}") + + if self.save: + self._dir_out.mkdir(exist_ok=True) + filename = self._dir_out / "neural_network_model.pkl" + with open(filename, "wb") as f: + torch.save(best_model, f) + pd.DataFrame(params, index=[0]).to_csv( + self._dir_out / "neural_network_params.csv", index=False + ) + + return self.model + + def evaluate_model(self) -> dict: + """Evaluate the trained model. + + Returns: + dict: The evaluation metrics. + """ + super().evaluate_model() + + if self.save: + self._dir_out.mkdir(exist_ok=True) + pd.DataFrame(self.metrics, index=[0]).to_csv( + self._dir_out / "neural_network_metrics.csv", index=False + ) + + return self.metrics From ecb4dc3a46c68913f8c5833e7db8ff859852229d Mon Sep 17 00:00:00 2001 From: Alec <30010253+alec-glisman@users.noreply.github.com> Date: Fri, 12 Apr 2024 10:02:54 -0700 Subject: [PATCH 5/5] Add model evaluation for XGBoost and NeuralNetModels. Further documentation added --- alec-glisman/README.md | 14 ++++-- alec-glisman/main.py | 2 + alec-glisman/src/data_load.py | 14 +++++- alec-glisman/src/models.py | 90 ++++++++++++++++++++++------------- 4 files changed, 81 insertions(+), 39 deletions(-) diff --git a/alec-glisman/README.md b/alec-glisman/README.md index 0e2f9415..28817594 100644 --- a/alec-glisman/README.md +++ b/alec-glisman/README.md @@ -7,16 +7,24 @@ Alec Glisman This directory contains files for the ML Property Predict project for Mat3ra.com. Input data is accessed from the Materials Project and the data is cleaned into Pandas Dataframes inside `data/data_load.py`. +I chose to download all materials with a bandgap of less than 10 eV from the Materials Project and parsed all data related to the crystallographic and stoichiometric properties. +Categorical data is converted to numeric data using one-hot encoding and the data is then scaled using `sklearn.preprocessing.StandardScaler`. The input data source to the machine learning model can be augmented with additional Materials Project data with the `MaterialData` init method and external data can also be merged using its respective `add_data_columns` method. The cleaned data is archived using Pandas in conjunction with HDF5 to lower runtime costs for model development. 
-The best XGBoost Regressor that I trained is saved during runtime under the `models` directory and has an MSE of 0.700 eV. +I chose to pursue two machine-learning architectures: XGBoost and feed-forward, fully connected, neural networks. +XGBoost generally performs better than neural networks when the data set is not large, and XGBoost is also much faster to train. +Neural networks were included for their superior expressivity and serve as a useful comparison to XGBoost. +In both cases, I employed `KFold` and `RandomizedSearchCV` from `scikit-learn` to cross-validate and select hyperparameters, respectively. + +The best XGBoost Regressor that I trained is saved during runtime under the `models` directory and has a testing sample MSE of 0.646 eV. +Similarly, the best fully connected neural network I trained is saved during runtime under the `models` directory and has a testing sample MSE of 0.817 eV. The seed used is provided in `main.py` for reproducibility. Areas for future work include: 1. Stratified sampling for test/train split or cross-validation to make sure different space groups are represented properly in each subset. -2. Explore the use of feed-forward neural networks and experiment with architecture/drop-out to optimize the performance. +2. Explore the use of feed-forward neural networks and experiment with architecture, drop-out, and regularization to optimize the performance. Additionally, increase the epochs from 40. I used 40 due to computational constraints, but the loss was still noticeably shrinking. 3. Addition of more data from the Materials Project to lower the inductive bias of the models. 4. Attempt transfer-learning of these models and fine-tune to more specific databases, such as silicon semiconductors. @@ -31,7 +39,7 @@ $ conda env create -f requirements.yml The overall project can then be run with ```[bash] -$ python3 main.py +$ python main.py ``` Unit tests can be run with pytest as diff --git a/alec-glisman/main.py b/alec-glisman/main.py index ae7ecba1..33cf4fb1 100644 --- a/alec-glisman/main.py +++ b/alec-glisman/main.py @@ -48,8 +48,10 @@ def main() -> None: # Train models xgb = XGBoostModels(x_train, y_train, x_test, y_test, save=True) xgb.train_models(seed=seed) + xgb.evaluate_model() nn = NeuralNetModels(x_train, y_train, x_test, y_test, save=True) nn.train_models(seed=seed) + nn.evaluate_model() # Notify user that the script has finished print("Script completed successfully.") diff --git a/alec-glisman/src/data_load.py b/alec-glisman/src/data_load.py index 5e4e36d3..6a9a1f11 100644 --- a/alec-glisman/src/data_load.py +++ b/alec-glisman/src/data_load.py @@ -27,6 +27,7 @@ from pathlib import Path import joblib +import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler @@ -196,9 +197,18 @@ def split_data( # extract ID for later use mpid = self.dataframe["id"] + x = self.dataframe.drop(columns=[target, "id"]).to_numpy(dtype=np.float64) + y = self.dataframe[target].to_numpy(dtype=np.float64).reshape(-1, 1) + + # drop rows with NaN entries in x or y + mask_x = np.isnan(x).any(axis=1) + mask_y = np.isnan(y).flatten() + mask = mask_x | mask_y + x = x[~mask] + y = y[~mask] + mpid = mpid[~mask] + # test/train split - x = self.dataframe.drop(columns=[target, "id"]).to_numpy() - y = self.dataframe[target].to_numpy().reshape(-1, 1) x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=test_size, random_state=seed, shuffle=True ) diff --git a/alec-glisman/src/models.py 
b/alec-glisman/src/models.py index f3ad755a..fc710618 100644 --- a/alec-glisman/src/models.py +++ b/alec-glisman/src/models.py @@ -1,14 +1,21 @@ +""" +This module contains classes for training and evaluating materials models. + +The module includes the following classes: +- MaterialsModels: A base class for training and evaluating machine learning +models on materials data. +- XGBoostModels: A class for training and evaluating XGBoost models on +materials data. +- NeuralNetModels: A class for training and evaluating PyTorch models on +materials data. +""" + from pathlib import Path import numpy as np import pandas as pd from scipy.stats import uniform, randint -from sklearn.metrics import ( - auc, - accuracy_score, - confusion_matrix, - mean_squared_error, -) +from sklearn.metrics import mean_squared_error from sklearn.model_selection import KFold, RandomizedSearchCV from skorch import NeuralNetRegressor import torch @@ -105,17 +112,11 @@ def evaluate_model(self) -> dict: dict: The evaluation metrics. """ y_pred = self.model.predict(self.x_test) - accuracy = accuracy_score(self.y_test, y_pred) - confusion = confusion_matrix(self.y_test, y_pred) mse = mean_squared_error(self.y_test, y_pred) - aucp = auc(self.y_test, y_pred) self.metrics = { - "accuracy": accuracy, - "confusion_matrix": confusion, "mean_squared_error": mse, "root_mean_squared_error": np.sqrt(mse), - "auc": aucp, } return self.metrics @@ -138,7 +139,7 @@ class XGBoostModels(MaterialsModels): """ def train_models( - self, param_grid: dict = None, cv: int = 10, seed: int = 42 + self, param_grid: dict = None, cv: int = 5, seed: int = 42 ) -> xgb.XGBRegressor: """Train XGBoost models with cross-validation and grid search. @@ -146,7 +147,7 @@ def train_models( param_grid (dict): The parameter grid for the grid search. Defaults to None. cv (int, optional): The number of cross-validation folds. - Defaults to 10. + Defaults to 5. seed (int, optional): The random seed for the train-test split. Defaults to 42. @@ -173,10 +174,11 @@ def train_models( random_search = RandomizedSearchCV( model, param_distributions=param_grid, - n_iter=10, + n_iter=20, scoring="neg_mean_squared_error", n_jobs=-1, cv=kfold.split(self.x_train, self.y_train), + refit=True, verbose=3, random_state=seed, ) @@ -187,8 +189,6 @@ def train_models( params = random_search.best_params_ print(f"Best Model:\n{best_model}") - # train the best model on the full training set - best_model.fit(self.x_train, self.y_train) y_pred = best_model.predict(self.x_test) mse = mean_squared_error(self.y_test, y_pred) print(f"Mean Squared Error: {mse}") @@ -286,7 +286,8 @@ def forward(self, x): Args: x (torch.Tensor): The input data. """ - return self.net(x) + y = self.net(x) + return y def __init__( self, @@ -319,7 +320,7 @@ def train_models( param_grid: dict = None, cv: int = 3, seed: int = 42, - epochs: int = 50, + epochs: int = 40, ) -> NeuralNetRegressor: """Train PyTorch models with cross-validation and grid search. @@ -331,22 +332,24 @@ def train_models( seed (int, optional): The random seed for the train-test split. Defaults to 42. epochs (int, optional): The number of training epochs. - Defaults to 50. + Defaults to 40. Returns: NeuralNetRegressor: The best trained PyTorch model. 
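+
+        Example:
+            A minimal sketch, assuming scaled arrays from
+            `MaterialData.split_data` (argument values are illustrative):
+
+                nn_models = NeuralNetModels(x_train, y_train, x_test, y_test)
+                best = nn_models.train_models(cv=3, seed=42, epochs=40)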
""" if param_grid is None: param_grid = { - "module__hidden_size": randint(20, 250), - "module__num_layers": randint(4, 7), + "module__hidden_size": randint(20, 300), + "module__num_layers": randint(4, 8), "module__dropout": uniform(0.0, 0.2), - "lr": uniform(0.001, 0.1), + "lr": uniform(0.001, 0.01), } input_size = self.x_train.shape[1] output_size = self.y_train.shape[1] + # set seed for reproducibility torch.manual_seed(seed) + net = NeuralNetRegressor( module=self.Net, module__input_size=input_size, @@ -357,29 +360,48 @@ def train_models( max_epochs=epochs, lr=0.1, optimizer=torch.optim.SGD, + optimizer__weight_decay=0.0005, + device="cuda" if torch.cuda.is_available() else "cpu", + batch_size=256, ) kfold = KFold(n_splits=cv, random_state=seed, shuffle=True) random_search = RandomizedSearchCV( net, param_distributions=param_grid, - n_iter=2, + n_iter=20, scoring="neg_mean_squared_error", - n_jobs=-1, + n_jobs=3, cv=kfold.split(self.x_train, self.y_train), verbose=3, random_state=seed, + refit=False, ) random_search.fit(self.x_train, self.y_train) - best_model = random_search.best_estimator_ - self.model = best_model - params = random_search.best_params_ - print(f"Best Model:\n{best_model}") + # find the best model from the search manually as refit is False + results = pd.DataFrame(random_search.cv_results_) + best_idx = results["rank_test_score"].idxmin() + best_params = results.loc[best_idx, "params"] + print(f"Best Parameters:\n{best_params}") + bestnet = NeuralNetRegressor( + module=self.Net, + module__input_size=input_size, + module__hidden_size=best_params["module__hidden_size"], + module__output_size=output_size, + module__num_layers=best_params["module__num_layers"], + module__dropout=best_params["module__dropout"], + max_epochs=epochs, + lr=best_params["lr"], + optimizer=torch.optim.SGD, + optimizer__weight_decay=0.0005, + device="cuda" if torch.cuda.is_available() else "cpu", + batch_size=256, + ) + bestnet.fit(self.x_train, self.y_train) + self.model = bestnet - # train the best model on the full training set - best_model.fit(self.x_train, self.y_train) - y_pred = best_model.predict(self.x_test) + y_pred = self.model.predict(self.x_test) mse = mean_squared_error(self.y_test, y_pred) print(f"Mean Squared Error: {mse}") @@ -387,8 +409,8 @@ def train_models( self._dir_out.mkdir(exist_ok=True) filename = self._dir_out / "neural_network_model.pkl" with open(filename, "wb") as f: - torch.save(best_model, f) - pd.DataFrame(params, index=[0]).to_csv( + torch.save(self.model, f) + pd.DataFrame(best_params, index=[0]).to_csv( self._dir_out / "neural_network_params.csv", index=False )