def outlier_removal(data, quantiles=None):
    """Drop rows whose values reach the upper quantile cut-off of a column.

    Args:
        data: DataFrame holding the columns named in ``quantiles``.
        quantiles: Optional mapping ``{column: quantile}``. A row is kept
            only if, for every listed column, its value is strictly below
            that column's quantile. Defaults to the loan-data cut-offs
            ``ApplicantIncome`` 0.95, ``CoapplicantIncome`` 0.98 and
            ``LoanAmount`` 0.975.

    Returns:
        A new DataFrame with the offending rows removed; ``data`` itself is
        not modified and the surviving rows keep their original index.

    Raises:
        KeyError: if a column named in ``quantiles`` is missing from ``data``.

    Note:
        Comparisons against NaN are False, so rows with a missing value in
        any filtered column are dropped as well.
    """
    if quantiles is None:
        quantiles = {
            'ApplicantIncome': 0.95,
            'CoapplicantIncome': 0.98,
            'LoanAmount': 0.975,
        }

    # Every threshold is computed on the unfiltered data so that the order of
    # the columns does not influence the result.
    keep = pd.Series(True, index=data.index)
    for column, quantile in quantiles.items():
        keep &= data[column] < data[column].quantile(quantile)

    return data[keep]
def data_cleaning(data):
    """Impute missing values and split the loan data into train/test sets.

    Numeric columns are cast to float and their missing values replaced by
    the column mean. Every other column has its missing values replaced by
    the column's most frequent value. The cleaned frame lists the numeric
    columns first, then the remaining ones, each group in its original
    order, and keeps the index of ``data``.

    Args:
        data: DataFrame containing a ``Loan_Status`` column.

    Returns:
        Tuple ``(X, y, X_train, X_test, y_train, y_test)`` where ``X`` is the
        cleaned frame without ``Loan_Status``, ``y`` is the ``Loan_Status``
        column, and the rest is a 75/25 split made with ``random_state=9``.
    """
    numeric_columns = list(data.select_dtypes(include=[np.number]).columns)
    categorical_columns = [
        column for column in data.columns if column not in numeric_columns
    ]

    # Mean-impute while staying in pandas so the original row index is kept;
    # rebuilding the frame from a bare array would reset it and misalign the
    # rows when the two halves are concatenated below.
    numeric_data = data[numeric_columns].astype(float)
    numeric_data = numeric_data.fillna(numeric_data.mean())

    # Work on a copy so the caller's frame is never written to.
    categorical_data = data[categorical_columns].copy()
    for column in categorical_columns:
        modes = categorical_data[column].mode()
        if not modes.empty:  # an all-NaN column has no mode to fill with
            categorical_data[column] = categorical_data[column].fillna(
                modes.iloc[0]
            )

    data = pd.concat([numeric_data, categorical_data], axis=1)

    X = data.drop('Loan_Status', axis=1)
    y = data['Loan_Status']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=9
    )
    return X, y, X_train, X_test, y_train, y_test
def _sqrt_and_encode(features, numeric_columns, encoders):
    """Square-root the numeric columns and label-encode the encoded ones."""
    numeric_data = np.sqrt(features[numeric_columns])
    # Copy so the encoded values are not written into the caller's frame.
    categorical_data = features[list(encoders)].copy()
    for column, encoder in encoders.items():
        categorical_data[column] = encoder.transform(categorical_data[column])
    return pd.concat([numeric_data, categorical_data], axis=1)


def data_cleaning_2(X_train, X_test, y_train, y_test):
    """Transform the train/test splits into purely numeric inputs.

    Numeric feature columns are replaced by their square root and every
    other feature column is label-encoded. The target is label-encoded too.
    Each encoder is fitted once on the train and test values together, so a
    given category always maps to the same integer in both splits.

    Args:
        X_train: Training features (DataFrame).
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training target labels.
        y_test: Test target labels.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The feature frames list
        the numeric columns first, then the encoded ones, and keep their
        original index. The targets are returned as new Series with a fresh
        default index.
    """
    numeric_columns = list(X_train.select_dtypes(include=[np.number]).columns)
    categorical_columns = [
        column for column in X_train.columns if column not in numeric_columns
    ]

    # One encoder per column, shared by both splits.
    encoders = {
        column: LabelEncoder().fit(
            pd.concat([X_train[column], X_test[column]])
        )
        for column in categorical_columns
    }

    X_train = _sqrt_and_encode(X_train, numeric_columns, encoders)
    X_test = _sqrt_and_encode(X_test, numeric_columns, encoders)

    target_encoder = LabelEncoder().fit(
        pd.concat([pd.Series(y_train), pd.Series(y_test)])
    )
    y_train = pd.Series(target_encoder.transform(y_train))
    y_test = pd.Series(target_encoder.transform(y_test))

    return X_train, X_test, y_train, y_test
def logistic_regression(X_train, X_test, y_train, y_test):
    """Fit a logistic regression and return its test-set confusion matrix.

    The columns ``ApplicantIncome``, ``CoapplicantIncome`` and ``LoanAmount``
    are standardised first. The scaler is fitted on the training data only
    and then applied unchanged to the test data, so both splits are scaled
    with the same mean and standard deviation.

    Args:
        X_train: Training features (DataFrame containing the three columns).
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The result of ``confusion_matrix(y_test, predictions)``.
    """
    columns_to_scale = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

    # Scale copies so the caller's frames are left untouched.
    X_train = X_train.copy()
    X_test = X_test.copy()

    scaler = StandardScaler()
    X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
    # transform, not fit_transform: reuse the statistics learned from the
    # training split instead of re-estimating them on the test split.
    X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

    model = LogisticRegression(random_state=9)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    return confusion_matrix(y_test, y_pred)