diff --git a/Summer-2020-Data-Analysis-Project-Brandon-Branch/KMeansClustering.py b/Summer-2020-Data-Analysis-Project-Brandon-Branch/KMeansClustering.py new file mode 100644 index 0000000..0edbcf2 --- /dev/null +++ b/Summer-2020-Data-Analysis-Project-Brandon-Branch/KMeansClustering.py @@ -0,0 +1,130 @@ +# ============================================================================= +# KMeansClustering.py +# Name: Alycia Wong and Brandon Wong +# Date: June 2020 +# Description: Process and graph a CSV file containing biomedical data that +# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +# Randomly generate up to 10 centroids without issue. Each centroid will have a +# classification. The nearest centroid to a point will determine the point's +# classification (decide what to do if the distances are equal yourself). +# Create random test cases until centroids stop moving and determine whether +# each case is likely to have CKD depending on the classification of the +# nearest centroid. +# Bonus: Create lines roughly separating each centroid group +# ============================================================================= + +# ============================================================================= +# Import statements +# ============================================================================= +import matplotlib.pyplot as plt +import numpy as np +import NearestNeighborClassifier as NNC +from scipy.spatial import KDTree as kdt + +# ============================================================================= +# Functions +# ============================================================================= +# randomCentroids function takes in an integer number of clusters to be +# generated. +# OR asks for k number of integer clusters +# Outputs a 2D array filled with random values between 0-1. The +# first column represents glucose and the second column represents hemoglobin.
# There are k number of rows representing the number of centroids and the
# classification of each centroid (i.e.: row index = classification value).
# OR you can have a third column with the classification value.
def randomCentroids(k):
    """Return k random centroids inside the unit square.

    Parameters
    ----------
    k : int
        Number of centroids (clusters) to generate.

    Returns
    -------
    numpy.ndarray
        Array of shape (k, 2) with values drawn uniformly from [0, 1).
        Column 0 is glucose and column 1 is hemoglobin; the row index is the
        classification value of the centroid.
    """
    return np.random.rand(k, 2)


# assignToCentroids function takes in the normalized (glucose, hemoglobin)
# coordinates from the CSV file and the array of centroids (for example from
# randomCentroids). A KDTree nearest-neighbor query gives every point the
# classification (row index) of its nearest centroid. A 1D array holding one
# centroid index per point is returned.
def assignToCentroids(normArr, centArr):
    """Return the index of the nearest centroid for every data point.

    Parameters
    ----------
    normArr : numpy.ndarray or object with a ``paras`` attribute
        Either the (n, 2) array of normalized coordinates, or the parsed data
        object whose ``paras`` attribute holds that array.
    centArr : numpy.ndarray
        (k, 2) array of centroid coordinates in the same column order.

    Returns
    -------
    numpy.ndarray
        1D integer array of length n; element i is the row index in
        ``centArr`` of the centroid closest to point i.
    """
    # iterate() hands over the whole data object rather than its coordinate
    # array, so unwrap ``.paras`` when it is present; plain arrays pass
    # through unchanged.
    points = np.asarray(getattr(normArr, "paras", normArr), dtype=float)
    centroids = np.asarray(centArr, dtype=float)
    # KDTree.query returns (distances, indices); only the indices are needed.
    _, nearest = kdt(centroids).query(points)
    return nearest
# Example:
# print(assignToCentroids(NNC.normalizeData(NNC.openCSVFile('ckd.csv')).paras, np.array([[.5, .5],[.25,.25]])))

# updateCentroids function inputs the 2D array of centroid locations, the
# classification of every point, and the normalized CSV data. The average
# glucose (column 0) and hemoglobin (column 1) positions of all data points
# of each classification are found and an updated 2D array with these average
# cartesian points as the location for the new centroids is returned. The
# array of original centroids that was passed in is left unmodified.
# Worked example for two clusters: the average of all points labelled 1
# becomes the new centroid 1, and the average of all points labelled 0 becomes
# the new centroid 0.

def updateCentroids(centArr, classArr, normArr):
    """Move every centroid to the mean of the points assigned to it.

    Parameters
    ----------
    centArr : numpy.ndarray
        (k, 2) array of current centroids (column 0 glucose, column 1
        hemoglobin). It is not modified.
    classArr : numpy.ndarray
        1D array with the centroid index assigned to each data point.
    normArr : object
        Data object exposing 1D ``gluc`` and ``hemo`` arrays.

    Returns
    -------
    numpy.ndarray
        New (k, 2) float array of centroid positions.
    """
    # Float copy: the caller's array stays untouched and integer centroid
    # arrays do not truncate the computed means.
    upCentArr = np.array(centArr, dtype=float)
    classArr = np.asarray(classArr)
    for i in range(len(upCentArr)):
        members = classArr == i
        # A centroid that attracted no points keeps its previous position.
        # The mean of an empty selection is NaN, and because NaN != NaN the
        # convergence test in iterate() would never succeed.
        if not members.any():
            continue
        upCentArr[i, 0] = np.mean(normArr.gluc[members])
        upCentArr[i, 1] = np.mean(normArr.hemo[members])
    return upCentArr
# Example:
# centArr = np.array([[0.5, 0.5], [.25, .25]])
# print(updateCentroids(
#     centArr, assignToCentroids(
#         NNC.normalizeData(NNC.openCSVFile('ckd.csv')).paras, centArr),
#     NNC.normalizeData(NNC.openCSVFile('ckd.csv'))
# ))
# print(centArr)

# iterate function inputs the normalized data and the starting centroids and
# repeats the assign/update steps until the updated centroids are equal to the
# previous ones (centArr == upCentArr). The final centroids are returned.
def iterate(normArr, centArr):
    """Run K-means until the centroids stop moving.

    Parameters
    ----------
    normArr : object
        Data object exposing ``paras`` (the (n, 2) coordinate array) as well
        as the ``gluc`` and ``hemo`` columns.
    centArr : numpy.ndarray
        (k, 2) array of starting centroids.

    Returns
    -------
    numpy.ndarray
        (k, 2) array of converged centroids.
    """
    centArr = np.asarray(centArr, dtype=float)
    # A loop instead of recursion: the number of passes is not bounded by the
    # interpreter's recursion limit.
    while True:
        # The nearest-centroid query needs the raw coordinate array, not the
        # data object itself.
        classArr = assignToCentroids(normArr.paras, centArr)
        upCentArr = updateCentroids(centArr, classArr, normArr)
        if not (upCentArr != centArr).any():
            return centArr
        centArr = upCentArr


# Demo run, executed only when this file is run as a script so that importing
# the module does not read the CSV file or print anything.
if __name__ == "__main__":
    print(iterate(
        NNC.normalizeData(NNC.openCSVFile('ckd.csv')), np.array([[.5, .5],[.25,.25]])
        ))

# graphClusters void function takes in a 1D and a 2D numpy array to graph. The
# 1D array of centroid locations and classifications have distinct points on
# the graph. The 2D array graphs points of normalized CSV data and colors them
# the same color as their corresponding centroids. A legend is generated in a
# reasonable position.
# Bonus: Create lines roughly separating each centroid group
def graphClusters():
    """Placeholder: the cluster plot is not implemented yet; returns None."""
    return

# dataAnalysis void function takes in the original parsed CSV classifications
# and the final classifications of the data based on K-means clustering (use of
# centroids) and compares the two to find false/true positives/negatives.
+# Note: This should only run when there are two centroids (i.e.: k = 2) +# False positive: Percentage of non-CKD patients that were incorrectly labelled +# by K-Means as being in the CKD cluster +# True positive (sensitivity): Percentage of CKD patients that were correctly +# labelled by K-Means +# False negative: Percentage of CKD patients that were incorrectly labelled by +# K-Means as being in the non-CKD cluster +# True negative (specificity): Percentage of non-CKD patients that were +# correctly labelled by K-Means +# Note: True positive (~93 %) + False negative (~7%) = 100% +# Note: True Negative (~100%) + False positive (~0%) = 100% +def dataAnalysis(): + return +# ============================================================================= +# Main Script +# ============================================================================= +# mainDriver function takes in nothing and graphs both the original CSV file, +# the k number of nearest neighbors, and the test case. This function returns +# 0. +# NOTE(review): the three NNC statements below pass no arguments, although the +# NearestNeighborClassifier functions each take one, and the first statement +# only references openCSVFile without calling it - confirm before use. +def mainDriver(): + # Open the CSV file using the parsing method from + # NearestNeighborClassifier. Input a file name, outputs the parsed data. + NNC.openCSVFile + + # Normalize data using method from NearestNeighborClassifier. Input and + # outputs the parsed data object + NNC.normalizeData() + + # Graph CSV file using method from NearestNeighborClassifier. Input the + # normalized data object. Void function.
+ NNC.graphCSVFile() + + return 0 \ No newline at end of file diff --git a/Summer-2020-Data-Analysis-Project-Brandon-Branch/KNearestNeighborClassifier.py b/Summer-2020-Data-Analysis-Project-Brandon-Branch/KNearestNeighborClassifier.py new file mode 100644 index 0000000..382bd59 --- /dev/null +++ b/Summer-2020-Data-Analysis-Project-Brandon-Branch/KNearestNeighborClassifier.py @@ -0,0 +1,67 @@ +# ============================================================================= +# KNearestNeighborClassifier.py +# Name: Alycia Wong and Brandon Wong +# Date: June 2020 +# Description: Process and graph a CSV file containing biomedical data that +# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +# Create a random test case and determine whether the case is +# likely to have CKD depending on the mode of the classifications of the +# k number of nearest points. +# ============================================================================= + +# ============================================================================= +# Import statements +# ============================================================================= +import matplotlib.pyplot as plt +import numpy as np +import NearestNeighborClassifier as NNC +from statistics import mode + +# ============================================================================= +# Functions +# ============================================================================= +# findDistanceArray inputs the normalized data, a random point, and an integer +# k and uses the findDistance function from NearestNeighborClassifier. The +# function outputs a 1D array containing the indices of the k nearest points +# to the random test case.
def findDistanceArray(normArr, testCase, k):
    """Return the indices of the k data points closest to the test case.

    Parameters
    ----------
    normArr : object
        Data object exposing 1D ``hemo`` and ``gluc`` arrays.
    testCase : sequence of two floats
        Test point indexed as [glucose, hemoglobin].
    k : int
        Number of neighbors wanted; values larger than the data set return
        every index.

    Returns
    -------
    numpy.ndarray
        1D integer array of at most k indices, nearest first.

    Raises
    ------
    ValueError
        If k is negative (a negative slice would silently drop the farthest
        points instead of selecting neighbors).
    """
    if k < 0:
        raise ValueError("k must be a non-negative number of neighbors")
    # Same distance formula as NearestNeighborClassifier.findDistance, applied
    # to the whole columns at once instead of row by row.
    distArr = np.sqrt((normArr.hemo - testCase[1]) ** 2
                      + (normArr.gluc - testCase[0]) ** 2)
    # A stable sort keeps equally distant points in their original row order.
    kindex = np.argsort(distArr, kind="stable")[:k]
    return kindex

# graphKNearestNeighbor void function takes in a 1D test case, the normalized
# data, and the number of neighbors k. The test case is graphed as its own
# distinct point, colored by the majority classification of its k nearest
# points, and those k points are highlighted. The data is graphed with
# hemoglobin as the x-axis and glucose as the y-axis, and the classification
# determines the color of the points. A legend is generated in a reasonable
# position.
def graphKNearestNeighbor(testCase, normArr, k):
    """Plot the data, the test case and its k nearest neighbors.

    Parameters
    ----------
    testCase : sequence of two floats
        Test point indexed as [glucose, hemoglobin].
    normArr : object
        Data object exposing ``hemo``, ``gluc`` and ``disease`` arrays.
    k : int
        Number of neighbors that vote on the classification.

    Raises
    ------
    ValueError
        If no neighbor is selected (k < 1 or an empty data set).
    """
    from collections import Counter

    kindex = findDistanceArray(normArr, testCase, k)
    if len(kindex) == 0:
        raise ValueError("at least one neighbor is needed to classify the test case")

    # Majority vote over the neighbors' classes. Counter.most_common lists
    # equal counts in first-encountered order, so a tied vote goes to the
    # class of the nearest neighbor instead of raising the way
    # statistics.mode does on Python 3.7.
    votes = Counter(normArr.disease[kindex].tolist())
    majority = votes.most_common(1)[0][0]

    NNC.graphCSVFile(normArr)
    plt.scatter(testCase[1], testCase[0],
                c=('b' if majority == 0 else 'r'),
                label='Test Case',
                marker="x")
    plt.scatter(normArr.hemo[kindex], normArr.gluc[kindex],
                c='y', label='Nearest neighbor(s)')
    plt.legend(fontsize="small")
    plt.show()
    return

# =============================================================================
# Main Script
# =============================================================================
# mainDriver function takes in nothing and graphs both the original CSV file,
# the k number of nearest neighbors, and the test case.
This function returns +# 0. +def mainDriver(): + val = int(input("How many neighbors are you looking for: ")) + test = NNC.createTestCase() + normal = NNC.normalizeData(NNC.openCSVFile('ckd.csv')) + graphKNearestNeighbor(test, normal, val) + return 0 +mainDriver() \ No newline at end of file diff --git a/Summer-2020-Data-Analysis-Project-Brandon-Branch/NearestNeighborClassifier.py b/Summer-2020-Data-Analysis-Project-Brandon-Branch/NearestNeighborClassifier.py new file mode 100644 index 0000000..6d0e806 --- /dev/null +++ b/Summer-2020-Data-Analysis-Project-Brandon-Branch/NearestNeighborClassifier.py @@ -0,0 +1,112 @@ +# ============================================================================= +# NearestNeighborClassifier.py +# Name: Alycia Wong and Brandon Wong +# Date: June 2020 +# Description: Process and graph a CSV file containing biomedical data that +# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +# Create a random test case and determine whether the case is +# likely to have CKD depending on the classification of the nearest point.
# =============================================================================

# =============================================================================
# Import statements
# =============================================================================
import matplotlib.pyplot as plt
import numpy as np

# =============================================================================
# Classes
# =============================================================================
class Butts:
    """Column-wise view of the CKD data set.

    Parameters
    ----------
    data : array-like
        2D table whose first three columns are glucose, hemoglobin and
        classification, one row per case. A single row is accepted as well.
    """

    def __init__(self, data):
        # genfromtxt returns a 1D array for a single-row file; promote it so
        # the column slices below always work.
        data = np.atleast_2d(np.asarray(data, dtype=float))
        self.gluc = data[:, 0]      # glucose column
        self.hemo = data[:, 1]      # hemoglobin column
        self.disease = data[:, 2]   # classification column
        self.len = len(data)        # number of cases (rows)
        self.all = data[:, :3]      # glucose, hemoglobin, classification
        self.paras = data[:, :2]    # glucose, hemoglobin only
        self.shape = np.shape(data)
        self.colmax = np.amax(data, axis=0)  # per-column maximum
        self.colmin = np.amin(data, axis=0)  # per-column minimum

# =============================================================================
# Functions
# =============================================================================
# Parses in file and turns it into Butts class of data. The first row of the
# file is treated as a header and skipped.
def openCSVFile(fileName):
    """Read a comma-separated file (header row skipped) into a Butts object."""
    return Butts(np.genfromtxt(fileName, delimiter=',', skip_header=1))

# Takes in butts class
# Scales every column to the 0-1 range using the column minimum and maximum
# returns normalized butts class data
def normalizeData(dataArr):
    """Return a new Butts object with each column min-max scaled to 0-1.

    Parameters
    ----------
    dataArr : Butts
        Parsed data; its ``all``, ``colmin`` and ``colmax`` attributes are
        used.

    Returns
    -------
    Butts
        Normalized copy of the glucose, hemoglobin and classification
        columns. A column whose values are all equal maps to 0 instead of
        dividing by zero.
    """
    values = dataArr.all
    ncols = values.shape[1]
    low = dataArr.colmin[:ncols]
    span = dataArr.colmax[:ncols] - low
    # A constant column has zero range; divide by 1 so it becomes 0, not NaN.
    span = np.where(span == 0, 1, span)
    return Butts((values - low) / span)

# graphCSVFile void function takes in the normalized data and graphs it with
# hemoglobin (second CSV column) as the x-axis and glucose (first CSV column)
# as the y-axis. The third column (classification) is used to determine the
# color of the points on the graph.
def graphCSVFile(normArr):
    """Scatter-plot the data set, colored by classification.

    Parameters
    ----------
    normArr : object
        Data object exposing ``hemo``, ``gluc`` and ``disease`` arrays.
        Points with disease == 0 are drawn blue ('No CKD'), points with
        disease == 1 red ('CKD').
    """
    healthy = normArr.disease == 0
    sick = normArr.disease == 1
    plt.scatter(normArr.hemo[healthy], normArr.gluc[healthy],
                c='b', label='No CKD')
    plt.scatter(normArr.hemo[sick], normArr.gluc[sick],
                c='r', label='CKD')
    plt.title('Hemoglobin and Glucose levels')
    plt.xlabel('Hemoglobin')
    plt.ylabel('Glucose')
    return
# findDistance function takes in the cartesian coordinates of two points and
# uses the distance formula to return the distance between them. The
# coordinates may be numbers or numpy arrays, in which case the distances are
# computed element by element.
def findDistance(x1, y1, x2, y2):
    """Return the Euclidean distance between (x1, y1) and (x2, y2)."""
    return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

# createTestCase function creates a random test case of two values from 0-1
# and returns them as a new 1D array. The callers in this file read index 0
# as glucose and index 1 as hemoglobin.
def createTestCase():
    """Return a 1D array of two random values drawn uniformly from [0, 1)."""
    return np.random.rand(2)

# nearestNeighborIndex takes in the test case point and returns the index of the
# nearest point to the test case
def nearestNeighborIndex(testCase, normArr):
    """Return the row index of the data point closest to the test case.

    Parameters
    ----------
    testCase : sequence of two floats
        Test point indexed as [glucose, hemoglobin].
    normArr : object
        Data object exposing 1D ``hemo`` and ``gluc`` arrays.

    Returns
    -------
    int
        Index of the nearest data point (the first one when several are
        equally close).
    """
    # findDistance works on whole columns, so one call replaces the
    # row-by-row loop.
    distArr = findDistance(normArr.hemo, normArr.gluc, testCase[1], testCase[0])
    return distArr.argmin()

# graphNearestNeighbor void function takes in a test case (see createTestCase)
# and the normalized data and graphs hemoglobin as the x-axis and glucose as
# the y-axis; the classification determines the color of the points. A
# randomly generated test case is graphed as a distinct point with a
# line connecting it to the nearest neighbor whose classification it takes on.
# A legend is generated in a reasonable position.
+def graphNearestNeighbor(testCase, normArr): + nni = nearestNeighborIndex(testCase, normArr) + graphCSVFile(normArr) + # testCase is indexed as [glucose, hemoglobin]; hemoglobin goes on the x-axis. + plt.scatter(testCase[1], testCase[0], + c = ('b' if normArr.disease[nni]==0 else 'r'), + label = 'Test Case', + marker = "x") + # Black line joining the test case to its nearest neighbor. + plt.plot([testCase[1], normArr.hemo[nni]], [testCase[0], normArr.gluc[nni]], 'k-') + plt.legend() + plt.show() + return + +# ============================================================================= +# Main Script +# ============================================================================= +# mainDriver function takes in no inputs and graphs both the original CSV +# file and the test case. This function returns 0. +def mainDriver(): + graphNearestNeighbor(createTestCase(), normalizeData(openCSVFile('ckd.csv'))) + return 0 +# mainDriver() \ No newline at end of file diff --git a/Summer-2020-Data-Analysis-Project-Brandon-Branch/README.md b/Summer-2020-Data-Analysis-Project-Brandon-Branch/README.md new file mode 100644 index 0000000..d1d53d0 --- /dev/null +++ b/Summer-2020-Data-Analysis-Project-Brandon-Branch/README.md @@ -0,0 +1,97 @@ +# Summer-2020-ML-Project + +# Nearest Neighbor Classifier Script Description: +Process and graph a CSV file containing biomedical data that relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +Create n number of random test cases and determine whether the case is likely to have CKD depending on the classification of the nearest point. + +# Nearest Neighbor Classifier Function Descriptions: +openCSVFile function takes in a file name and parses/organizes data from a CSV file into a 2-D numpy array with the columns being: +glucose, hemoglobin, classification and each row being a case. + +normalizeData function takes in a 2D numpy array and +scales down the first and second columns to range from 0-1 and +outputs a 2D array with the normalized data.
+ +graphCSVFile void function takes in a 2D numpy array and graphs with: +hemoglobin (the second column) as the x-axis and glucose (the first column) as the y-axis. +The third column (classification) is used to determine the color of the points on the graph. + +findDistance function takes in cartesian coordinates and +uses the distance formula +to return the distance between the two points. + +createTestCase function creates a random test case of two values (glucose and hemoglobin) from 0-1 and +creates/returns a new 1D array with the two values. + +graphNearestNeighbor void function takes in a 2D numpy array (and a cartesian +coordinate depending on createTestCase) and +graphs hemoglobin (the second column) as the x-axis and glucose (the first column) as the y-axis. +The third column (classification) determines the color of the points. +A randomly generated test case is graphed as a distinct point with a line connecting it to the nearest neighbor whose classification it takes on. +A legend is generated in a reasonable position. + +mainDriver function takes in no inputs and graphs both the original CSV file and the test case. +This function returns 0. + +# K Nearest Neighbor Classifier Script Description: + +Process and graph a CSV file containing biomedical data that relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +Create a random test case and determine whether the case is likely to have CKD depending on the mode of the classifications of the k number of nearest points. + +# K Nearest Neighbor Classifier Function Descriptions: + +findDistanceArray inputs a numpy array, a random point, and an integer k and +uses the findDistance function from NearestNeighborClassifier. +The function outputs a 1D array containing the indices of the k nearest points to the random test case. + +graphKNearestNeighbor void function takes in two 1D and one 2D numpy arrays to graph.
+One of the 1D arrays is a random testCase with its own distinct points. +The other 1D array is used to circle the k number of points closest to the test case. +The 2D array contains information parsed from the CSV file. +Hemoglobin (the second column) is graphed as the x-axis and glucose (the first column) as the y-axis. +The third column (classification) determines the color of the points. +A legend is generated in a reasonable position. + +mainDriver function takes in nothing and graphs both the original CSV file, the k number of nearest neighbors, and the test case. +This function returns 0. + +# K Means Clustering Script Description: + +Process and graph a CSV file containing biomedical data that relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +Randomly generate up to 10 centroids without issue. +Each centroid will have a classification. +The nearest centroid to a point will determine the point's classification (decide what to do if the distances are equal yourself). +Create random test cases until centroids stop moving and determine whether each case is likely to have CKD depending on the classification of the nearest centroid. + +# K Means Clustering Function Descriptions: + +randomCentroids function takes in an integer number of clusters to be generated. +OR asks for k number of integer clusters +Outputs a 2D array filled with random values between 0-1. +The first column represents glucose and the second column represents hemoglobin. +There are k number of rows representing the number of centroids and the classification of each centroid (i.e.: row index = classification value). +OR you can have a third column with the classification value. + +assignToCentroids function takes in an array of normalized glucose and hemoglobin values from the CSV file and the randomly generated array of centroids from randomCentroids.
+Using a nearest-neighbor query on a KDTree (scipy.spatial.KDTree) built from the centroids, points are assigned the same classification as the nearest centroid. +A 1D array with the index of the nearest centroid for each point is returned. + +updateCentroids function inputs the 2D array of centroid locations and of classified and normalized CSV data. +The average glucose and hemoglobin positions of all data points for each classification are found and +an updated 2D array with these average cartesian points as the location for the new centroids is returned. + +iterate function can either +a) input information and iterate the original information until centArr ~ upCentArr +b) don't input any information and run by itself. Similar to a main script +The function causes the centroids to reassign points and update the centroids until the centroids do not move. + +graphClusters void function takes in a 1D and a 2D numpy array to graph. +The 1D array of centroid locations and classifications have distinct points on the graph. +The 2D array graphs points of normalized CSV data and colors them the same color as their corresponding centroids. +A legend is generated in a reasonable position. + +dataAnalysis void function takes in the original parsed CSV classifications and the final classifications of the data based on K-means clustering (use of centroids) and +compares the two to find false/true positives/negatives. + +mainDriver function takes in nothing and graphs both the original CSV file, the k number of nearest neighbors, and the test case. +This function returns 0.
diff --git a/Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/KNearestNeighborClassifier.cpython-37.pyc b/Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/KNearestNeighborClassifier.cpython-37.pyc new file mode 100644 index 0000000..14a61c6 Binary files /dev/null and b/Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/KNearestNeighborClassifier.cpython-37.pyc differ diff --git a/Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/NearestNeighborClassifier.cpython-37.pyc b/Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/NearestNeighborClassifier.cpython-37.pyc new file mode 100644 index 0000000..e3fd485 Binary files /dev/null and b/Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/NearestNeighborClassifier.cpython-37.pyc differ diff --git a/Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/NearestNeighborClassifier.cpython-38.pyc b/Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/NearestNeighborClassifier.cpython-38.pyc new file mode 100644 index 0000000..15a1e11 Binary files /dev/null and b/Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/NearestNeighborClassifier.cpython-38.pyc differ diff --git a/Summer-2020-Data-Analysis-Project-Brandon-Branch/ckd.csv b/Summer-2020-Data-Analysis-Project-Brandon-Branch/ckd.csv new file mode 100644 index 0000000..d071373 --- /dev/null +++ b/Summer-2020-Data-Analysis-Project-Brandon-Branch/ckd.csv @@ -0,0 +1,159 @@ +Glucose,Hemoglobin,Class +117,11.2,1 +70,9.5,1 +380,10.8,1 +157,5.6,1 +173,7.7,1 +95,9.8,1 +264,12.5,1 +70,10,1 +253,10.5,1 +163,9.8,1 +129,9.1,1 +133,10.3,1 +76,7.1,1 +280,13,1 +210,16.1,1 +219,10.4,1 +295,9.2,1 +118,11.4,1 +224,8.1,1 +128,8.2,1 +118,12,1 +105,11.1,1 +288,7.9,1 +273,8.3,1 +122,12.6,1 +303,10.4,1 +102,8.7,1 +107,8.3,1 +117,10,1 +239,9.5,1 +94,9.9,1 +129,8.1,1 +252,11.2,1 +255,7.3,1 +253,10.9,1 +214,10.9,1 +490,11.5,1 +163,7.9,1 +241,9.6,1 +214,9.4,1 +106,8.6,1 +424,12.6,1 +176,3.1,1 +140,15,0 +70,17,0 
+82,15.9,0 +119,15.4,0 +99,13,0 +121,13.6,0 +131,14.5,0 +91,14,0 +98,13.9,0 +104,16.1,0 +131,14.1,0 +122,17,0 +118,15.5,0 +117,16.2,0 +132,14.4,0 +97,14.2,0 +133,13.2,0 +122,13.9,0 +121,15,0 +111,14.3,0 +96,13.8,0 +139,14.8,0 +125,16.5,0 +123,15.7,0 +112,14.5,0 +140,16.3,0 +130,15.5,0 +123,14.6,0 +100,16.9,0 +94,16,0 +81,14.7,0 +93,16.6,0 +124,14.9,0 +89,16.7,0 +125,16.8,0 +91,13.5,0 +127,15.1,0 +96,16.9,0 +128,13.1,0 +122,17.1,0 +128,15.2,0 +137,13.6,0 +81,13.9,0 +102,13.2,0 +132,13.7,0 +104,17.3,0 +131,15.6,0 +102,15,0 +120,17.4,0 +105,15.7,0 +109,13.9,0 +130,15.9,0 +100,14,0 +109,15.8,0 +120,13.4,0 +80,14.1,0 +130,13.5,0 +99,17.7,0 +134,14.2,0 +92,14,0 +132,17.8,0 +88,13.3,0 +100,14.3,0 +130,13.4,0 +95,15,0 +111,16.2,0 +106,14.4,0 +97,13.5,0 +108,17.8,0 +99,13.6,0 +83,17.5,0 +109,15,0 +86,13.6,0 +102,14.6,0 +95,15,0 +87,17.1,0 +107,13.6,0 +117,13,0 +88,17.2,0 +105,14.7,0 +70,13.7,0 +89,15,0 +118,14.8,0 +81,15,0 +125,17.4,0 +82,14.9,0 +107,13.6,0 +83,16.2,0 +79,17.6,0 +109,15,0 +133,13.7,0 +111,16.3,0 +74,15.1,0 +88,16.4,0 +97,13.8,0 +78,16.1,0 +113,15.3,0 +75,16.8,0 +119,13.9,0 +132,15.4,0 +113,16.5,0 +100,16.4,0 +93,16.7,0 +94,15.5,0 +112,17,0 +99,15,0 +85,15.6,0 +133,14.8,0 +117,13,0 +137,14.1,0 +140,15.7,0 +75,16.5,0 +100,15.8,0 +114,14.2,0 +131,15.8,0