# -*- coding: utf-8 -*-
"""
Created on Mon Mar 16 10:59:17 2020

@author: alborsa1
"""
# %% Import Libraries
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import preprocessing
import sklearn.metrics as metrics


# %% Functions for Feature Engineering

# -----------------------------------------------------------------------------
# Function for getting the label-to-integer mapping from a fitted encoder
def get_integer_mapping(le):
    '''
    Return a dict mapping labels to their integer values
    from a fitted sklearn LabelEncoder.
    INPUT:
        - le : a fitted sklearn LabelEncoder
    '''
    res = {}
    for cl in le.classes_:
        res[cl] = le.transform([cl])[0]

    return res

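# A minimal usage sketch for get_integer_mapping (the label values below are
# made up for illustration; any fitted LabelEncoder works the same way):
#
#     le = preprocessing.LabelEncoder().fit(['Open', 'Closed', 'Open'])
#     get_integer_mapping(le)   # -> {'Closed': 0, 'Open': 1}
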

def Encode_fields(PandasDF, fields):
    '''
    Return the dataframe with the given fields label-encoded, together with
    the label-to-integer mapping of each field.
    INPUT:
        - PandasDF : DataFrame
        - fields   : list of column names to encode
    OUTPUT:
        - PandasDF : DataFrame with the fields replaced by their encoded values
        - Mapping  : list of [field, mapping dict] pairs
    '''
    Mapping = []
    for field in fields:
        print("Encoding:", field)
        # Replace missing values with the placeholder '-99' before encoding
        TempDF = PandasDF.loc[:, field].copy().fillna('-99')
        Encoder = preprocessing.LabelEncoder()
        Fitted_Encoder = Encoder.fit(TempDF)
        Encoded_label = Fitted_Encoder.transform(TempDF)
        Mapping.append([field, get_integer_mapping(Fitted_Encoder)])
        PandasDF.loc[:, field] = Encoded_label

    return PandasDF, Mapping

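# A minimal usage sketch for Encode_fields (the DataFrame and the 'status'
# column below are hypothetical, only to show the expected call and outputs):
#
#     df = pd.DataFrame({'status': ['Open', 'Closed', None]})
#     df, mapping = Encode_fields(df, ['status'])
#     # mapping -> [['status', {'-99': 0, 'Closed': 1, 'Open': 2}]]
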
# -----------------------------------------------------------------------------

# Function for scaling features
def scaleFeaturesDF(data_train):
    '''
    Feature scaling is a transformation that only changes the scale, not the
    number of features. Because of this, we can still use the original
    dataset's column names, as long as we keep in mind that the _units_ have
    been altered.

    Method: preprocessing.StandardScaler()

    INPUT:
        - data_train : Pandas DataFrame used to fit the scaler

    OUTPUT:
        - data_train : scaled DataFrame
        - transf     : fitted scaler, to be reused on the testing dataset
    '''
    columns = data_train.columns

    transf = preprocessing.StandardScaler(with_mean=True).fit(data_train)
    data_train = transf.transform(data_train)
    data_train = pd.DataFrame(data=data_train, columns=columns)

    return data_train, transf

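# A minimal usage sketch for scaleFeaturesDF (X_train and X_test are
# hypothetical feature DataFrames; the fitted scaler returned here should be
# reused to transform the testing features):
#
#     X_train_scaled, scaler = scaleFeaturesDF(X_train)
#     X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
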

# -----------------------------------------------------------------------------

# %% Functions for Training and Classification

# Split into train and test datasets
def split(split_dataset, X, y, perc_testing):
    '''
    INPUT:
        - split_dataset : True or False
        - X             : features dataset
        - y             : label dataset
        - perc_testing  : fraction of the dataset to assign to the testing phase
    OUTPUT:
        - data_train  : dataset used to train the model
        - data_test   : dataset used to test the model
        - label_train : labels of the training dataset
        - label_test  : labels of the testing dataset
    '''
    if split_dataset:
        data_train, data_test, label_train, label_test = train_test_split(X, y, test_size=perc_testing, random_state=7)
        print("##--**: Dataset split complete")
        print("Testing dataset is", perc_testing * 100, "% of the initial dataset")
    else:
        data_train, data_test, label_train, label_test = X, X, y, y
        print("##--**: Dataset not split (training and testing sets are identical)")

    return data_train, data_test, label_train, label_test

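# A minimal usage sketch for split (X and y are hypothetical feature and label
# objects; 0.3 reserves 30% of the rows for the testing phase):
#
#     data_train, data_test, label_train, label_test = split(True, X, y, 0.3)
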

# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# Function for computing classification evaluation statistics
def compute_evaluation_stats(label_test, prediction_test):
    '''
    Plot the confusion matrix and compute precision and recall.
    INPUT:
        - label_test      : true labels of the testing dataset (Pandas Series)
        - prediction_test : labels predicted by the model
    OUTPUT:
        - precision : tp / (tp + fp)
        - recall    : tp / (tp + fn)
    '''
    y_true = label_test.values
    y_pred = prediction_test
    columns = ['Closed', 'Open']

    # Plot the confusion matrix (rows: true labels, columns: predicted labels)
    confusion = metrics.confusion_matrix(y_true, y_pred)
    plt.imshow(confusion, cmap=plt.cm.Blues, interpolation='nearest')
    plt.xticks([0, 1], columns, rotation='vertical')
    plt.yticks([0, 1], columns)
    plt.colorbar()
    plt.show()

    # Reuse the confusion matrix computed above to extract the counts
    tn, fp, fn, tp = confusion.ravel()
    print("True Positive:", tp)
    print("True Negative:", tn)
    print("False Positive:", fp)
    print("False Negative:", fn)

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    print("Precision:", precision)
    print("Recall:", recall)

    return precision, recall

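# A minimal sanity-check sketch for compute_evaluation_stats (the labels below
# are made up; the (tn, fp, fn, tp) unpacking assumes a binary problem):
#
#     y_true = pd.Series([0, 1, 1, 0, 1])
#     y_pred = [0, 1, 0, 0, 1]
#     precision, recall = compute_evaluation_stats(y_true, y_pred)  # 1.0, 0.666..
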

# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# Function for K-Neighbors
def kneigh(df_train, df_test, label_train, label_test):
    '''
    Function for training a K-Neighbors classifier.
    INPUT:
        - df_train    : features dataset for training the model
        - df_test     : features dataset for testing the model
        - label_train : labels of the training dataset
        - label_test  : labels of the testing dataset

    OUTPUT:
        - knmodel       : trained model
        - knmodel_stats : statistics about the model
    '''
    # Set model parameters
    print("##--**: Computing K-Neighbors classifier..")
    neighbors = 5
    print("##--**: N-Neighbors:", neighbors)

    # Define model
    knmodel = KNeighborsClassifier(n_neighbors=neighbors, weights='uniform')

    # Train model
    print("##--**.a: Train KNeighborsClassifier model..")
    knmodel = knmodel.fit(df_train, label_train)

    # Calculate and display the accuracy on the training dataset
    accuracy_training_knmodel = knmodel.score(df_train, label_train)
    print("Scoring model (accuracy), on training dataset:", accuracy_training_knmodel)

    # Compute predictions on the testing dataset
    prediction_test = knmodel.predict(df_test)

    # Calculate and display the accuracy on the testing dataset
    accuracy_testing_knmodel = knmodel.score(df_test, label_test)
    print("Scoring model (accuracy), on testing dataset:", accuracy_testing_knmodel)

    # Calculate evaluation statistics
    precision_knmodel, recall_knmodel = compute_evaluation_stats(label_test, prediction_test)

    knmodel_stats = ['KNeighborsClassifier', accuracy_training_knmodel, accuracy_testing_knmodel, precision_knmodel,
                     recall_knmodel]

    return knmodel, knmodel_stats

# -----------------------------------------------------------------------------
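
# -----------------------------------------------------------------------------
# A minimal end-to-end sketch of how these functions fit together. The file
# name, column names and target field below are hypothetical placeholders,
# not part of the original pipeline:
#
#     raw = pd.read_csv('data.csv')                              # hypothetical input
#     raw, mapping = Encode_fields(raw, ['status', 'category'])  # encode categoricals
#     X = raw.drop(columns=['target'])
#     y = raw['target']
#     data_train, data_test, label_train, label_test = split(True, X, y, 0.3)
#     data_train, scaler = scaleFeaturesDF(data_train)
#     data_test = pd.DataFrame(scaler.transform(data_test), columns=data_test.columns)
#     knmodel, knmodel_stats = kneigh(data_train, data_test, label_train, label_test)
# -----------------------------------------------------------------------------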