"""Exploratory-data-analysis helpers operating on a pandas DataFrame."""
import pandas as pd
from scipy.stats import norm

# Columns cast to ``category`` when the caller does not name any.
DEFAULT_CATEGORICAL_COLUMNS = ('dept', 'join_date', 'quit_date')

# dtypes that get_numerical_variables() reports as numerical.
NUMERICAL_DTYPES = ('float64', 'int64')

# The histogram figure is a fixed 2 x 2 grid, so at most four columns fit.
_HISTOGRAM_GRID = (2, 2)
_MAX_HISTOGRAMS = _HISTOGRAM_GRID[0] * _HISTOGRAM_GRID[1]


def get_categorical_variables(df, columns=None):
    """Cast the categorical columns of *df* to ``category`` and list them.

    NOTE: this mutates *df* in place -- each named column is replaced by
    its ``category``-dtype equivalent.

    Args:
        df: DataFrame containing every column named in *columns*.
        columns: Iterable of column names to treat as categorical.
            Defaults to ``DEFAULT_CATEGORICAL_COLUMNS``.

    Returns:
        A new list with the categorical column names, in the given order.

    Raises:
        KeyError: If a named column is missing from *df*.
    """
    if columns is None:
        columns = DEFAULT_CATEGORICAL_COLUMNS
    # Materialise once so a one-shot iterable can be both iterated and returned.
    columns = list(columns)
    for col in columns:
        df[col] = df[col].astype('category')
    return columns


def get_numerical_variables(df):
    """Return the names of the ``float64``/``int64`` columns of *df*.

    Args:
        df: DataFrame to inspect; it is not modified.

    Returns:
        List of column names, in the order they appear in *df*.
    """
    return list(df.select_dtypes(include=list(NUMERICAL_DTYPES)).columns)


def get_numerical_variables_percentile(df):
    """Return the transposed ``DataFrame.describe()`` summary of *df*.

    Returns:
        DataFrame with one row per described column and one column per
        statistic reported by ``describe()``.
    """
    return df.describe().T


def get_categorical_variables_modes(df):
    """Return the mode(s) of the default categorical columns of *df*.

    Delegates to get_categorical_variables(), so the categorical columns
    of *df* are cast to ``category`` in place as a side effect.

    Returns:
        DataFrame as produced by ``DataFrame.mode()``, restricted to the
        categorical columns.
    """
    return df[get_categorical_variables(df)].mode()


def get_missing_values_count(df):
    """Count the null values in every column of *df*.

    Returns:
        Single-column DataFrame (column label ``0``) indexed by the column
        names of *df*, holding the number of nulls in each column.
    """
    return pd.DataFrame(df.isnull().sum())


def plot_histogram_with_numerical_values(df):
    """Draw histograms with a fitted normal curve for the numerical columns.

    Up to the first four numerical columns of *df* (as reported by
    get_numerical_variables()) are plotted on a 2 x 2 grid of a new
    15 x 6 inch matplotlib figure. Frames with fewer than four numerical
    columns simply leave the remaining grid cells unused.

    Args:
        df: DataFrame whose numerical columns are plotted.

    Returns:
        None. The figure is left as the current matplotlib figure.
    """
    # Imported lazily so the tabular helpers above remain usable on
    # machines without a plotting stack installed.
    import matplotlib.pyplot as plt
    import seaborn as sns

    numerical_variables = get_numerical_variables(df)[:_MAX_HISTOGRAMS]
    rows, cols = _HISTOGRAM_GRID

    plt.figure(figsize=(15, 6))
    for position, column in enumerate(numerical_variables, 1):
        plt.subplot(rows, cols, position)
        # NOTE(review): distplot is deprecated in recent seaborn releases;
        # kept here to preserve the existing rendering (hist + normal fit).
        sns.distplot(df[column], bins=10, color='yellow', fit=norm, kde=False)