def outlier_removal(loan_data, quantile=0.95,
                    columns=('ApplicantIncome', 'CoapplicantIncome',
                             'LoanAmount')):
    """Return a copy of ``loan_data`` without its upper-tail outlier rows.

    A row counts as an outlier when its value in any of ``columns`` is
    strictly greater than that column's ``quantile``. Every threshold is
    computed from the full input before any row is removed, so the result
    does not depend on the order of ``columns``.

    Args:
        loan_data: DataFrame containing every column named in ``columns``.
        quantile: Upper quantile, between 0 and 1, used as the cut-off.
        columns: Names of the numeric columns to screen.

    Returns:
        A new DataFrame holding the surviving rows with their original
        index labels. ``loan_data`` itself is left unmodified.
    """
    screened = loan_data[list(columns)]
    thresholds = screened.quantile(quantile)
    # Comparisons against NaN evaluate to False, so rows with missing values
    # in a screened column are kept rather than treated as outliers.
    exceeds = screened.gt(thresholds, axis='columns').any(axis='columns')
    return loan_data.loc[~exceeds].copy()
def data_cleaning(df):
    """Impute missing values in ``df`` and split it into train/test sets.

    Columns selected by ``select_dtypes(include='number')`` are filled with
    their mean; every other column is filled with its most frequent value.
    The last column of ``df`` is used as the target and all preceding
    columns as features.

    Args:
        df: DataFrame whose last column is the target. Its columns are
            overwritten with the imputed versions, so the caller's frame is
            updated as well.

    Returns:
        Tuple ``(X, y, X_train, X_test, y_train, y_test)`` where the split
        holds out 25% of the rows with ``random_state=9``.
    """
    numeric_cols = df.select_dtypes(include='number').columns
    categorical_cols = [col for col in df.columns if col not in numeric_cols]

    for col in numeric_cols:
        df[col] = df[col].fillna(df[col].mean())

    for col in categorical_cols:
        # mode() returns a Series (there may be ties); passing that Series to
        # fillna would align on index labels and leave most gaps unfilled,
        # so fill with its first value instead.
        modes = df[col].mode()
        if not modes.empty:  # an all-NaN column has no mode to fill with
            df[col] = df[col].fillna(modes.iloc[0])

    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=9)

    return X, y, X_train, X_test, y_train, y_test
def data_cleaning_2(X_train, X_test, y_train, y_test):
    """Transform and encode the train and test feature frames identically.

    Steps, applied to copies of both frames:
      1. Square-root every column that is numeric in ``X_train``.
      2. Label-encode ``Gender``, ``Married``, ``Education`` and
         ``Self_Employed`` with an encoder fitted on the training data.
      3. One-hot encode ``Dependents`` and ``Property_Area``, then give
         ``X_test`` exactly the columns of ``X_train``.

    Args:
        X_train: Training features containing the columns named above.
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training target, returned unchanged.
        y_test: Test target, returned unchanged.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` with new, encoded
        feature frames. The frames passed in are not modified.

    Raises:
        ValueError: If a label-encoded column of ``X_test`` holds a value
            that never occurs in ``X_train``.
    """
    # Local import keeps this function usable on its own.
    from sklearn.preprocessing import LabelEncoder

    label_cols = ['Gender', 'Married', 'Education', 'Self_Employed']
    one_hot_cols = ['Dependents', 'Property_Area']

    X_train = X_train.copy()
    X_test = X_test.copy()

    # Pick the numeric columns before encoding so that the freshly encoded
    # categorical columns are not square-rooted as well.
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns
    X_train[numeric_cols] = np.sqrt(X_train[numeric_cols])
    X_test[numeric_cols] = np.sqrt(X_test[numeric_cols])

    for col in label_cols:
        # Fit on the training data only and reuse the mapping for the test
        # data, so a given label gets the same code in both frames.
        encoder = LabelEncoder()
        X_train[col] = encoder.fit_transform(X_train[col])
        X_test[col] = encoder.transform(X_test[col])

    X_train = pd.get_dummies(X_train, columns=one_hot_cols)
    X_test = pd.get_dummies(X_test, columns=one_hot_cols)
    # A category missing from one split would otherwise give the two frames
    # different columns; align the test frame to the training layout.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    return X_train, X_test, y_train, y_test
def logistic_regression(X_train, X_test, y_train, y_test):
    """Fit a logistic regression and return its test confusion matrix.

    ``ApplicantIncome``, ``CoapplicantIncome`` and ``LoanAmount`` are
    standardised with a scaler fitted on the training data; the same scaler
    is then applied to the test data before predicting.

    Args:
        X_train: Training features containing the three scaled columns.
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training target.
        y_test: Test target.

    Returns:
        The result of ``confusion_matrix(y_test, y_pred)`` for the
        predictions made on ``X_test``. The frames passed in are not
        modified.
    """
    # Local import keeps this function usable on its own.
    from sklearn.preprocessing import StandardScaler

    scaled_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

    X_train = X_train.copy()
    X_test = X_test.copy()

    scaler = StandardScaler()
    X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
    # Reuse the training mean/std here: refitting on the test set would put
    # the two sets on different scales and leak test statistics.
    X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])

    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return confusion_matrix(y_test, y_pred)