def outlier_removal(data, quantile=0.95):
    """Return a copy of ``data`` without rows above an upper quantile.

    A row is removed when its value in any of ``ApplicantIncome``,
    ``CoapplicantIncome`` or ``LoanAmount`` is strictly greater than that
    column's ``quantile`` computed over the whole of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the three columns named above.
    quantile : float, optional
        Upper quantile used as the cut-off for each column (default 0.95).

    Returns
    -------
    pandas.DataFrame
        New frame holding the surviving rows; ``data`` itself is not modified.
    """
    columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
    subset = data[columns]

    # All thresholds are taken from the unfiltered data, so removing outliers
    # in one column does not shift the cut-off used for the next one.
    thresholds = subset.quantile(quantile)

    # A NaN compares as "not greater", so rows with missing values are kept
    # here and left for a later imputation step.
    is_outlier = subset.gt(thresholds, axis='columns').any(axis=1)

    return data[~is_outlier].copy()
def data_cleaning(data):
    """Impute missing values in ``data`` and split it into train/test sets.

    ``LoanAmount`` is filled with its mean; ``Loan_Amount_Term``,
    ``Credit_History``, ``Gender``, ``Married``, ``Dependents`` and
    ``Self_Employed`` are filled with their most frequent value. The last
    column of the frame is used as the target and every other column as a
    feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named above, target in the last column.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)`` where the split uses
        ``test_size=0.25`` and ``random_state=9``.
    """
    # Work on a copy so the imputed values actually reach X / y without
    # mutating the caller's frame.
    cleaned = data.copy()

    cleaned['LoanAmount'] = cleaned['LoanAmount'].fillna(cleaned['LoanAmount'].mean())

    mode_filled_columns = (
        'Loan_Amount_Term',
        'Credit_History',
        'Gender',
        'Married',
        'Dependents',
        'Self_Employed',
    )
    for column in mode_filled_columns:
        # Series.mode() ignores NaN and returns ties in sorted order, so the
        # first entry is a deterministic choice.
        modes = cleaned[column].mode()
        if not modes.empty:  # an all-NaN column has no mode to fill with
            cleaned[column] = cleaned[column].fillna(modes.iloc[0])

    X = cleaned.iloc[:, :-1]
    y = cleaned.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=9, test_size=0.25
    )
    return X, y, X_train, X_test, y_train, y_test
def data_cleaning_2(X_train, X_test, y_train, y_test):
    """Transform the train/test feature frames into model-ready numeric form.

    Steps applied to both frames:

    * square-root transform of ``ApplicantIncome``, ``CoapplicantIncome``
      and ``LoanAmount``;
    * integer label encoding of ``Gender``, ``Married``, ``Education`` and
      ``Self_Employed``;
    * one-hot encoding of ``Dependents`` and ``Property_Area``;
    * removal of ``Dependents``, ``Property_Area``, ``Credit_History`` and
      ``Loan_Amount_Term``.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing the columns named above.
    y_train, y_test
        Targets; returned unchanged.

    Returns
    -------
    tuple
        ``(final_X_train, final_X_test, y_train, y_test)``.
    """
    # Copy so the caller's frames (often slices of a larger frame) are not
    # modified in place.
    X_train = X_train.copy()
    X_test = X_test.copy()

    for column in ('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount'):
        X_train[column] = np.sqrt(X_train[column])
        X_test[column] = np.sqrt(X_test[column])

    for column in ('Gender', 'Married', 'Education', 'Self_Employed'):
        # Fit one encoder on the labels of both splits so a given category
        # maps to the same integer in train and test.
        encoder = LabelEncoder()
        encoder.fit(pd.concat([X_train[column], X_test[column]]))
        X_train[column] = encoder.transform(X_train[column])
        X_test[column] = encoder.transform(X_test[column])

    one_hot_columns = ['Dependents', 'Property_Area']
    train_dummies = pd.get_dummies(X_train[one_hot_columns])
    # Align the test dummies to the train layout so both frames expose the
    # same feature columns in the same order.
    test_dummies = pd.get_dummies(X_test[one_hot_columns]).reindex(
        columns=train_dummies.columns, fill_value=0
    )

    dropped_columns = one_hot_columns + ['Credit_History', 'Loan_Amount_Term']
    final_X_train = pd.concat([X_train, train_dummies], axis=1).drop(
        dropped_columns, axis=1
    )
    final_X_test = pd.concat([X_test, test_dummies], axis=1).drop(
        dropped_columns, axis=1
    )

    return final_X_train, final_X_test, y_train, y_test
def logistic_regression(X_train, X_test, y_train, y_test):
    """Fit a scaled logistic regression and return its test confusion matrix.

    A pipeline of ``StandardScaler`` followed by
    ``LogisticRegression(random_state=9)`` is fitted on the training data and
    then used to predict the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and targets used to fit the pipeline.
    X_test, y_test
        Held-out features and targets used for evaluation only.

    Returns
    -------
    Confusion matrix of ``y_test`` against the predictions for ``X_test``,
    as produced by ``sklearn.metrics.confusion_matrix``.
    """
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('logistic_regression', LogisticRegression(random_state=9)),
    ])

    # Fit on the training split only; fitting on the test split would leak
    # the evaluation data into both the scaler and the classifier.
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    return confusion_matrix(y_test, y_pred)