"""Exploratory helpers for the employee-retention data set."""
import operator

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# NOTE(review): this load runs at import time, so importing the module needs
# the CSV to exist relative to the current working directory. It is kept
# because callers may rely on the module-level ``df``.
df = pd.read_csv('data/employee_retention_data.csv')

# Columns treated as categorical. Shared by the two selector functions so the
# categorical/numerical split cannot drift apart.
_CATEGORICAL_COLUMNS = ['dept', 'join_date', 'quit_date']

# Layout of the histogram figure: a 2x2 grid, i.e. at most four panels.
_GRID_ROWS = 2
_GRID_COLS = 2


def get_categorical_variables(df):
    """Return the ``dept``, ``join_date`` and ``quit_date`` columns of *df*.

    Raises ``KeyError`` if any of those columns is missing from *df*.
    """
    return df[_CATEGORICAL_COLUMNS]


def get_numerical_variables(df):
    """Return *df* without the ``dept``, ``join_date`` and ``quit_date`` columns.

    Every remaining column is returned as-is; no dtype check is performed.
    Raises ``KeyError`` if any of the dropped columns is missing from *df*.
    """
    return df.drop(_CATEGORICAL_COLUMNS, axis=1)


def get_numerical_variables_percentile(df):
    """Return ``describe()`` of the numerical columns, transposed.

    The result has one row per column of ``get_numerical_variables(df)`` and
    one column per summary statistic.
    """
    numerical_df = get_numerical_variables(df)
    return numerical_df.describe().T


def get_categorical_variables_modes(df):
    """Return the mode(s) of each categorical column as a DataFrame."""
    categorical_df = get_categorical_variables(df)
    return categorical_df.mode()


def get_missing_values_count(df):
    """Return a one-column DataFrame ``NA_count`` with the nulls per column."""
    return pd.DataFrame(pd.isnull(df).sum().rename('NA_count'))


def plot_histogram_with_numerical_values(df):
    """Draw a density histogram with a fitted normal curve per numerical column.

    Up to the first four columns of ``get_numerical_variables(df)`` are drawn
    into a 2x2 grid on the current matplotlib figure. Each panel is titled
    with the column name. Nothing is returned and the figure is not shown or
    saved; the caller decides what to do with it.

    The plot uses only matplotlib and numpy, which this module already
    imports, so no extra plotting or statistics package is required.
    """
    numerical_df = get_numerical_variables(df)
    panel_count = min(len(numerical_df.columns), _GRID_ROWS * _GRID_COLS)

    for index in range(panel_count):
        # Positional access keeps this correct even with duplicate labels.
        values = numerical_df.iloc[:, index].dropna()

        plt.subplot(_GRID_ROWS, _GRID_COLS, index + 1)
        plt.title(numerical_df.columns[index])
        if values.empty:
            # Nothing to draw for an all-NaN column; leave the titled panel.
            continue

        # density=True puts the bars on the same scale as the fitted pdf.
        plt.hist(values, bins='auto', density=True, color='yellow', alpha=0.4)

        # Maximum-likelihood normal fit: sample mean and population std.
        mean = values.mean()
        std = values.std(ddof=0)
        if std > 0:
            grid = np.linspace(values.min(), values.max(), 200)
            pdf = np.exp(-0.5 * ((grid - mean) / std) ** 2)
            pdf /= std * np.sqrt(2.0 * np.pi)
            plt.plot(grid, pdf, color='black')