def outlier_removal(loan_data,
                    columns=('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount'),
                    quantile=0.95):
    """Return a copy of ``loan_data`` without rows that are high outliers.

    A row is treated as an outlier when its value in *any* of ``columns`` is
    strictly greater than that column's ``quantile``-th quantile.  Missing
    values never compare greater than the threshold, so rows containing NaN
    in these columns are kept.

    Parameters
    ----------
    loan_data : pandas.DataFrame
        Frame containing every column named in ``columns``.  It is not
        modified.
    columns : iterable of str, optional
        Columns checked for outliers.  Defaults to the three columns that were
        previously hard-coded.
    quantile : float, optional
        Upper quantile used as the cut-off (default ``0.95``).

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows that are not outliers.
    """
    is_outlier = None
    for column in columns:
        values = loan_data[column]
        # ``.values`` gives a positional mask, so filtering stays correct even
        # when the frame's index holds duplicate labels (a label-based
        # ``drop`` would remove every row sharing an outlier's label).
        exceeds = (values > values.quantile(quantile)).values
        is_outlier = exceeds if is_outlier is None else (is_outlier | exceeds)

    if is_outlier is None:
        # No columns to check: nothing is an outlier.
        return loan_data.copy()
    return loan_data[~is_outlier]
a/q01_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc b/q01_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc index 5a057ff..e0e9e12 100644 Binary files a/q01_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc and b/q01_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_outlier_removal/tests/__pycache__/test_q01_outlier_removal.cpython-36.pyc b/q01_outlier_removal/tests/__pycache__/test_q01_outlier_removal.cpython-36.pyc index 4c0b6c7..c981b99 100644 Binary files a/q01_outlier_removal/tests/__pycache__/test_q01_outlier_removal.cpython-36.pyc and b/q01_outlier_removal/tests/__pycache__/test_q01_outlier_removal.cpython-36.pyc differ diff --git a/q02_data_cleaning_all/__pycache__/__init__.cpython-36.pyc b/q02_data_cleaning_all/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..f12e418 Binary files /dev/null and b/q02_data_cleaning_all/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_data_cleaning_all/__pycache__/build.cpython-36.pyc b/q02_data_cleaning_all/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..aa6c288 Binary files /dev/null and b/q02_data_cleaning_all/__pycache__/build.cpython-36.pyc differ diff --git a/q02_data_cleaning_all/build.py b/q02_data_cleaning_all/build.py index b56e2bc..5228248 100644 --- a/q02_data_cleaning_all/build.py +++ b/q02_data_cleaning_all/build.py @@ -1,14 +1,48 @@ +# %load q02_data_cleaning_all/build.py # Default Imports import sys, os sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname('__file__')))) import pandas as pd import numpy as np -from sklearn.model_selection import train_test_split +from sklearn.preprocessing import Imputer +from sklearn.model_selection import train_test_split as tts from greyatomlib.logistic_regression_project.q01_outlier_removal.build import outlier_removal loan_data = pd.read_csv('data/loan_prediction_uncleaned.csv') loan_data = loan_data.drop('Loan_ID', 1) loan_data = 
def data_cleaning(loan_data, target='LoanAmount'):
    """Impute missing values and split the data into train and test sets.

    ``LoanAmount`` is filled with its column mean; ``Gender``, ``Married``,
    ``Dependents``, ``Self_Employed``, ``Loan_Amount_Term`` and
    ``Credit_History`` are filled with their most frequent value.  The
    cleaned frame is then split 75/25 with ``random_state=9``.

    Parameters
    ----------
    loan_data : pandas.DataFrame
        Raw loan data containing the columns listed above.  It is not
        modified; all imputation happens on a copy.
    target : str, optional
        Column returned as ``y`` and removed from ``X``.  Defaults to
        ``'LoanAmount'``, the column this function has always used.
        NOTE(review): a continuous loan amount is an unusual target for a
        classification exercise -- confirm the intended target column.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)``.
    """
    # Work on a copy so the caller's frame is not silently rewritten.
    cleaned = loan_data.copy()

    # Mean imputation done with pandas; the cast to float keeps the column
    # floating-point even when it contains no missing values.
    loan_amount = cleaned['LoanAmount'].astype(float)
    cleaned['LoanAmount'] = loan_amount.fillna(loan_amount.mean())

    mode_filled_columns = [
        'Gender',
        'Married',
        'Dependents',
        'Self_Employed',
        'Loan_Amount_Term',
        'Credit_History',
    ]
    for column in mode_filled_columns:
        most_frequent = cleaned[column].mode()[0]
        cleaned[column] = cleaned[column].fillna(most_frequent)

    X = cleaned.drop([target], axis=1)
    y = cleaned[target]

    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.25, random_state=9)

    return X, y, X_train, X_test, y_train, y_test
def data_cleaning_2(X_train, X_test, y_train, y_test):
    """Square-root transform selected columns and one-hot encode the features.

    ``CoapplicantIncome``, ``LoanAmount``, ``Loan_Amount_Term`` and
    ``Credit_History`` are replaced by their square roots, the frames are
    one-hot encoded with ``pd.get_dummies``, and one reference dummy per
    categorical feature is dropped from the training frame.  The test frame
    is then re-indexed to the training columns so both frames always share
    the same columns in the same order (dummies absent from the test split
    are filled with 0, dummies unseen in training are discarded).

    The input frames are not modified, so calling the function repeatedly on
    the same data does not compound the square-root transform.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing the four columns listed above.
    y_train, y_test
        Targets; returned unchanged.

    Returns
    -------
    tuple
        ``(X_train_encoded, X_test_encoded, y_train, y_test)``.

    Raises
    ------
    KeyError
        If one of the reference dummy columns is missing from the encoded
        training frame.
    """
    sqrt_columns = ['CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
    reference_dummies = [
        'Dependents_0',
        'Gender_Female',
        'Education_Graduate',
        'Self_Employed_No',
        'Married_No',
        'Property_Area_Rural',
    ]

    def _sqrt_and_encode(frame):
        # Copy first: the caller's frame must stay untouched.
        frame = frame.copy()
        for column in sqrt_columns:
            frame[column] = np.sqrt(frame[column])
        return pd.get_dummies(frame)

    train_encoded = _sqrt_and_encode(X_train).drop(reference_dummies, axis=1)
    # Align the test columns to the training columns instead of dropping by
    # name; a category missing from the test split would otherwise raise or
    # leave the two frames with different columns.
    test_encoded = _sqrt_and_encode(X_test).reindex(
        columns=train_encoded.columns, fill_value=0
    )

    return train_encoded, test_encoded, y_train, y_test
def logistic_regression(X_train, X_test, y_train, y_test):
    """Fit a logistic regression and return its test-set confusion matrix.

    ``ApplicantIncome``, ``CoapplicantIncome`` and ``LoanAmount`` are
    standardised with a ``StandardScaler`` fitted on the training data only;
    the same fitted scaler is then applied to the test data.  Scaling is done
    on copies, so the caller's frames are left untouched.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing the three columns named above.
    y_train, y_test
        Targets for the training and test rows.

    Returns
    -------
    The value of ``confusion_matrix(y_test, y_pred)`` for the fitted model's
    predictions on the scaled test features.
    """
    scaled_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

    X_train = X_train.copy()
    X_test = X_test.copy()

    scaler = StandardScaler()
    # Fitting alone does not change the data: the transformed values must be
    # written back, otherwise the model is trained on unscaled features.
    X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
    # Re-use the training statistics for the test set to avoid leakage.
    X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)

    y_pred = log_reg.predict(X_test)
    return confusion_matrix(y_test, y_pred)
0000000..81f72aa Binary files /dev/null and b/q03_logistic_regression/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_logistic_regression/tests/__pycache__/test_q03_logistic_regression.cpython-36.pyc b/q03_logistic_regression/tests/__pycache__/test_q03_logistic_regression.cpython-36.pyc new file mode 100644 index 0000000..c986977 Binary files /dev/null and b/q03_logistic_regression/tests/__pycache__/test_q03_logistic_regression.cpython-36.pyc differ