diff --git a/AdaBoost/ROC.py b/AdaBoost/ROC.py index 3f98c91..a833ea7 100644 --- a/AdaBoost/ROC.py +++ b/AdaBoost/ROC.py @@ -16,13 +16,12 @@ def loadDataSet(fileName): numFeat = len((open(fileName).readline().split('\t'))) - dataMat = []; labelMat = [] + dataMat = [] + labelMat = [] fr = open(fileName) for line in fr.readlines(): - lineArr = [] curLine = line.strip().split('\t') - for i in range(numFeat - 1): - lineArr.append(float(curLine[i])) + lineArr = [float(curLine[i]) for i in range(numFeat - 1)] dataMat.append(lineArr) labelMat.append(float(curLine[-1])) @@ -39,7 +38,7 @@ def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): Returns: retArray - 分类结果 """ - retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1 + retArray = np.ones((np.shape(dataMatrix)[0], 1)) #初始化retArray为1 if threshIneq == 'lt': retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1 else: @@ -59,18 +58,22 @@ def buildStump(dataArr,classLabels,D): minError - 最小误差 bestClasEst - 最佳的分类结果 """ - dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T + dataMatrix = np.mat(dataArr) + labelMat = np.mat(classLabels).T m,n = np.shape(dataMatrix) - numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1))) + numSteps = 10 + bestStump = {} + bestClasEst = np.mat(np.zeros((m, 1))) minError = float('inf') #最小误差初始化为正无穷大 for i in range(n): #遍历所有特征 - rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值 + rangeMin = dataMatrix[:, i].min() + rangeMax = dataMatrix[:, i].max() #找到特征中最小的值和最大值 stepSize = (rangeMax - rangeMin) / numSteps #计算步长 for j in range(-1, int(numSteps) + 1): for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than threshVal = (rangeMin + float(j) * stepSize) #计算阈值 predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果 - errArr = np.mat(np.ones((m,1))) #初始化误差矩阵 + errArr = np.mat(np.ones((m, 1))) #初始化误差矩阵 errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0 weightedError = D.T * errArr #计算误差 # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) @@ -82,7 +85,7 @@ def buildStump(dataArr,classLabels,D): bestStump['ineq'] = inequal return bestStump, minError, bestClasEst -def adaBoostTrainDS(dataArr, classLabels, numIt = 40): +def adaBoostTrainDS(dataArr, classLabels, numIt=40): """ 使用AdaBoost算法训练分类器 Parameters: @@ -96,7 +99,7 @@ def adaBoostTrainDS(dataArr, classLabels, numIt = 40): weakClassArr = [] m = np.shape(dataArr)[0] D = np.mat(np.ones((m, 1)) / m) #初始化权重 - aggClassEst = np.mat(np.zeros((m,1))) + aggClassEst = np.mat(np.zeros((m, 1))) for i in range(numIt): bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树 # print("D:",D.T) @@ -110,7 +113,7 @@ def adaBoostTrainDS(dataArr, classLabels, numIt = 40): #计算AdaBoost误差,当误差为0的时候,退出循环 aggClassEst += alpha * classEst #计算类别估计累计值 # print("aggClassEst: ", aggClassEst.T) - aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1))) #计算误差 + aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m, 1))) #计算误差 errorRate = aggErrors.sum() / m # print("total error: ", errorRate) if errorRate == 0.0: break #误差为0,退出循环 @@ -145,7 +148,7 @@ def plotROC(predStrengths, classLabels): ySum += cur[1] #高度累加 ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c = 'b') #绘制ROC cur = (cur[0] - delX, cur[1] - delY) #更新绘制光标的位置 - ax.plot([0,1], [0,1], 'b--') + ax.plot([0, 1], [0, 1], 'b--') plt.title('AdaBoost马疝病检测系统的ROC曲线', FontProperties = font) 
plt.xlabel('假阳率', FontProperties = font) plt.ylabel('真阳率', FontProperties = font) diff --git a/AdaBoost/adaboost.py b/AdaBoost/adaboost.py index 74bcac6..af5459f 100644 --- a/AdaBoost/adaboost.py +++ b/AdaBoost/adaboost.py @@ -28,7 +28,7 @@ def loadSimpData(): [ 1. , 1. ], [ 2. , 1. ]]) classLabels = [1.0, 1.0, -1.0, -1.0, 1.0] - return datMat,classLabels + return datMat, classLabels def showDataSet(dataMat, labelMat): """ @@ -46,10 +46,8 @@ def showDataSet(dataMat, labelMat): data_plus.append(dataMat[i]) else: data_minus.append(dataMat[i]) - data_plus_np = np.array(data_plus) #转换为numpy矩阵 - data_minus_np = np.array(data_minus) #转换为numpy矩阵 - plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1]) #正样本散点图 - plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1]) #负样本散点图 + plt.scatter(np.transpose(data_plus)[0], np.transpose(data_plus)[1]) #正样本散点图 + plt.scatter(np.transpose(data_minus)[0], np.transpose(data_minus)[1]) #负样本散点图 plt.show() def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): @@ -63,7 +61,7 @@ def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): Returns: retArray - 分类结果 """ - retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1 + retArray = np.ones((np.shape(dataMatrix)[0], 1)) #初始化retArray为1 if threshIneq == 'lt': retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1 else: @@ -83,18 +81,22 @@ def buildStump(dataArr,classLabels,D): minError - 最小误差 bestClasEst - 最佳的分类结果 """ - dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T + dataMatrix = np.mat(dataArr) + labelMat = np.mat(classLabels).T m,n = np.shape(dataMatrix) - numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1))) + numSteps = 10.0 + bestStump = {} + bestClasEst = np.mat(np.zeros((m, 1))) minError = float('inf') #最小误差初始化为正无穷大 for i in range(n): #遍历所有特征 - rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值 + rangeMin = dataMatrix[:, i].min() + rangeMax = dataMatrix[:, i].max() #找到特征中最小的值和最大值 stepSize = (rangeMax - rangeMin) / numSteps #计算步长 for j in range(-1, int(numSteps) + 1): for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than threshVal = (rangeMin + float(j) * stepSize) #计算阈值 predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果 - errArr = np.mat(np.ones((m,1))) #初始化误差矩阵 + errArr = np.mat(np.ones((m, 1))) #初始化误差矩阵 errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0 weightedError = D.T * errArr #计算误差 # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) @@ -106,7 +108,7 @@ def buildStump(dataArr,classLabels,D): bestStump['ineq'] = inequal return bestStump, minError, bestClasEst -def adaBoostTrainDS(dataArr, classLabels, numIt = 40): +def adaBoostTrainDS(dataArr, classLabels, numIt=40): """ 使用AdaBoost算法提升弱分类器性能 Parameters: @@ -120,7 +122,7 @@ def adaBoostTrainDS(dataArr, classLabels, numIt = 40): weakClassArr = [] m = np.shape(dataArr)[0] D = np.mat(np.ones((m, 1)) / m) #初始化权重 - aggClassEst = np.mat(np.zeros((m,1))) + aggClassEst = np.mat(np.zeros((m, 1))) for i in range(numIt): bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树 # print("D:",D.T) @@ -134,10 +136,11 @@ def adaBoostTrainDS(dataArr, classLabels, numIt = 40): #计算AdaBoost误差,当误差为0的时候,退出循环 aggClassEst += alpha * classEst #计算类别估计累计值 # print("aggClassEst: ", aggClassEst.T) - aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1))) #计算误差 + aggErrors = 
np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m, 1))) #计算误差 errorRate = aggErrors.sum() / m # print("total error: ", errorRate) - if errorRate == 0.0: break #误差为0,退出循环 + if errorRate == 0.0: + break #误差为0,退出循环 return weakClassArr, aggClassEst @@ -152,7 +155,7 @@ def adaClassify(datToClass,classifierArr): """ dataMatrix = np.mat(datToClass) m = np.shape(dataMatrix)[0] - aggClassEst = np.mat(np.zeros((m,1))) + aggClassEst = np.mat(np.zeros((m, 1))) for i in range(len(classifierArr)): #遍历所有分类器,进行分类 classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'], classifierArr[i]['thresh'], classifierArr[i]['ineq']) aggClassEst += classifierArr[i]['alpha'] * classEst diff --git a/AdaBoost/horse_adaboost.py b/AdaBoost/horse_adaboost.py index 8c5cb6d..e1a7b31 100644 --- a/AdaBoost/horse_adaboost.py +++ b/AdaBoost/horse_adaboost.py @@ -15,13 +15,12 @@ def loadDataSet(fileName): numFeat = len((open(fileName).readline().split('\t'))) - dataMat = []; labelMat = [] + dataMat = [] + labelMat = [] fr = open(fileName) for line in fr.readlines(): - lineArr = [] curLine = line.strip().split('\t') - for i in range(numFeat - 1): - lineArr.append(float(curLine[i])) + lineArr = [float(curLine[i]) for i in range(numFeat - 1)] dataMat.append(lineArr) labelMat.append(float(curLine[-1])) @@ -58,18 +57,22 @@ def buildStump(dataArr,classLabels,D): minError - 最小误差 bestClasEst - 最佳的分类结果 """ - dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T + dataMatrix = np.mat(dataArr) + labelMat = np.mat(classLabels).T m,n = np.shape(dataMatrix) - numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1))) + numSteps = 10 + bestStump = {} + bestClasEst = np.mat(np.zeros((m, 1))) minError = float('inf') #最小误差初始化为正无穷大 for i in range(n): #遍历所有特征 - rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值 + rangeMin = dataMatrix[:, i].min() + rangeMax = dataMatrix[:, i].max() #找到特征中最小的值和最大值 stepSize = (rangeMax - rangeMin) / numSteps #计算步长 for j in range(-1, int(numSteps) + 1): for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than threshVal = (rangeMin + float(j) * stepSize) #计算阈值 predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果 - errArr = np.mat(np.ones((m,1))) #初始化误差矩阵 + errArr = np.mat(np.ones((m, 1))) #初始化误差矩阵 errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0 weightedError = D.T * errArr #计算误差 # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) @@ -81,7 +84,7 @@ def buildStump(dataArr,classLabels,D): bestStump['ineq'] = inequal return bestStump, minError, bestClasEst -def adaBoostTrainDS(dataArr, classLabels, numIt = 40): +def adaBoostTrainDS(dataArr, classLabels, numIt=40): """ 使用AdaBoost算法提升弱分类器性能 Parameters: @@ -95,7 +98,7 @@ def adaBoostTrainDS(dataArr, classLabels, numIt = 40): weakClassArr = [] m = np.shape(dataArr)[0] D = np.mat(np.ones((m, 1)) / m) #初始化权重 - aggClassEst = np.mat(np.zeros((m,1))) + aggClassEst = np.mat(np.zeros((m, 1))) for i in range(numIt): bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树 # print("D:",D.T) @@ -109,10 +112,11 @@ def adaBoostTrainDS(dataArr, classLabels, numIt = 40): #计算AdaBoost误差,当误差为0的时候,退出循环 aggClassEst += alpha * classEst #计算类别估计累计值 # print("aggClassEst: ", aggClassEst.T) - aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1))) #计算误差 + aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m, 1))) #计算误差 errorRate = 
aggErrors.sum() / m # print("total error: ", errorRate) - if errorRate == 0.0: break #误差为0,退出循环 + if errorRate == 0.0: + break #误差为0,退出循环 return weakClassArr, aggClassEst def adaClassify(datToClass,classifierArr): @@ -126,7 +130,7 @@ def adaClassify(datToClass,classifierArr): """ dataMatrix = np.mat(datToClass) m = np.shape(dataMatrix)[0] - aggClassEst = np.mat(np.zeros((m,1))) + aggClassEst = np.mat(np.zeros((m, 1))) for i in range(len(classifierArr)): #遍历所有分类器,进行分类 classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'], classifierArr[i]['thresh'], classifierArr[i]['ineq']) aggClassEst += classifierArr[i]['alpha'] * classEst diff --git a/AdaBoost/sklearn_adaboost.py b/AdaBoost/sklearn_adaboost.py index 9056c60..6e7fdcd 100644 --- a/AdaBoost/sklearn_adaboost.py +++ b/AdaBoost/sklearn_adaboost.py @@ -16,13 +16,12 @@ def loadDataSet(fileName): numFeat = len((open(fileName).readline().split('\t'))) - dataMat = []; labelMat = [] + dataMat = [] + labelMat = [] fr = open(fileName) for line in fr.readlines(): - lineArr = [] curLine = line.strip().split('\t') - for i in range(numFeat - 1): - lineArr.append(float(curLine[i])) + lineArr = [float(curLine[i]) for i in range(numFeat - 1)] dataMat.append(lineArr) labelMat.append(float(curLine[-1])) diff --git a/Decision Tree/Decision Tree.py b/Decision Tree/Decision Tree.py index f0c7093..b3c657a 100644 --- a/Decision Tree/Decision Tree.py +++ b/Decision Tree/Decision Tree.py @@ -2,6 +2,7 @@ from matplotlib.font_manager import FontProperties import matplotlib.pyplot as plt from math import log +from collections import Counter import operator import pickle @@ -17,21 +18,20 @@ Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use list comprehension to simplify for. 2017-07-24 """ def calcShannonEnt(dataSet): - numEntires = len(dataSet) #返回数据集的行数 - labelCounts = {} #保存每个标签(Label)出现次数的字典 - for featVec in dataSet: #对每组特征向量进行统计 - currentLabel = featVec[-1] #提取标签(Label)信息 - if currentLabel not in labelCounts.keys(): #如果标签(Label)没有放入统计次数的字典,添加进去 - labelCounts[currentLabel] = 0 - labelCounts[currentLabel] += 1 #Label计数 - shannonEnt = 0.0 #经验熵(香农熵) - for key in labelCounts: #计算香农熵 - prob = float(labelCounts[key]) / numEntires #选择该标签(Label)的概率 - shannonEnt -= prob * log(prob, 2) #利用公式计算 - return shannonEnt #返回经验熵(香农熵) + # Count labels + label_count = Counter(data[-1] for data in dataSet) + # Compute prob + probs = [p[1] / len(dataSet) for p in label_count.items()] + # Compute entropy and sum + shannonEnt = sum([-p * log(p, 2) for p in probs]) + return shannonEnt """ 函数说明:创建测试数据集 @@ -81,15 +81,15 @@ def createDataSet(): Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-09 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + It is not useful to delete some data, + so use list comprehension is more readable. 
2017-07-24 """ -def splitDataSet(dataSet, axis, value): - retDataSet = [] #创建返回的数据集列表 - for featVec in dataSet: #遍历数据集 - if featVec[axis] == value: - reducedFeatVec = featVec[:axis] #去掉axis特征 - reducedFeatVec.extend(featVec[axis+1:]) #将符合条件的添加到返回的数据集 - retDataSet.append(reducedFeatVec) +def splitDataSet(dataSet, axis, value): + retDataSet = [data for data in dataSet for i, v in enumerate(data) if i == axis and v == value] return retDataSet #返回划分后的数据集 """ @@ -104,28 +104,28 @@ def splitDataSet(dataSet, axis, value): Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-09 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use list comprehension and sum to make the code more clear. 2017-07-20 """ def chooseBestFeatureToSplit(dataSet): - numFeatures = len(dataSet[0]) - 1 #特征数量 - baseEntropy = calcShannonEnt(dataSet) #计算数据集的香农熵 - bestInfoGain = 0.0 #信息增益 - bestFeature = -1 #最优特征的索引值 - for i in range(numFeatures): #遍历所有特征 - #获取dataSet的第i个所有特征 - featList = [example[i] for example in dataSet] - uniqueVals = set(featList) #创建set集合{},元素不可重复 - newEntropy = 0.0 #经验条件熵 - for value in uniqueVals: #计算信息增益 - subDataSet = splitDataSet(dataSet, i, value) #subDataSet划分后的子集 - prob = len(subDataSet) / float(len(dataSet)) #计算子集的概率 - newEntropy += prob * calcShannonEnt(subDataSet) #根据公式计算经验条件熵 - infoGain = baseEntropy - newEntropy #信息增益 - # print("第%d个特征的增益为%.3f" % (i, infoGain)) #打印每个特征的信息增益 - if (infoGain > bestInfoGain): #计算信息增益 - bestInfoGain = infoGain #更新信息增益,找到最大的信息增益 - bestFeature = i #记录信息增益最大的特征的索引值 - return bestFeature #返回信息增益最大的特征的索引值 + base_entropy = calcShannonEnt(dataSet) #计算数据集的香农熵 + best_info_gain = 0 + best_feature = -1 + for i in range(len(dataSet[0]) - 1): # 遍历所有特征 + feature_count = Counter([data[i] for data in dataSet]) + # 计算信息增益 + new_entropy = sum(feature[1] / float(len(dataSet)) * calcShannonEnt(splitDataSet(dataSet, i, feature[0])) \ + for feature in feature_count.items()) + # 信息增益 + info_gain = base_entropy - new_entropy + # print('No. {0} feature info gain is {1:.3f}'.format(i, info_gain)) + if info_gain > best_info_gain: + best_info_gain = info_gain + best_feature = i + return best_feature """ @@ -140,15 +140,15 @@ def chooseBestFeatureToSplit(dataSet): Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-09 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use Counter to clear code. 
2017-07-24 """ def majorityCnt(classList): - classCount = {} - for vote in classList: #统计classList中每个元素出现的次数 - if vote not in classCount.keys():classCount[vote] = 0 - classCount[vote] += 1 - sortedClassCount = sorted(classCount.items(), key = operator.itemgetter(1), reverse = True) #根据字典的值降序排序 - return sortedClassCount[0][0] #返回classList中出现次数最多的元素 + major_label = Counter(classList).most_common(1)[0] + return major_label """ 函数说明:创建决策树 @@ -195,6 +195,10 @@ def createTree(dataSet, labels, featLabels): Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-09 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use `is` instead of __name__ == 'dict' 2017-07-24 """ def getNumLeafs(myTree): @@ -202,9 +206,10 @@ def getNumLeafs(myTree): firstStr = next(iter(myTree)) #python3中myTree.keys()返回的是dict_keys,不在是list,所以不能使用myTree.keys()[0]的方法获取结点属性,可以使用list(myTree.keys())[0] secondDict = myTree[firstStr] #获取下一组字典 for key in secondDict.keys(): - if type(secondDict[key]).__name__=='dict': #测试该结点是否为字典,如果不是字典,代表此结点为叶子结点 + if type(secondDict[key]) is dict: #测试该结点是否为字典,如果不是字典,代表此结点为叶子结点 numLeafs += getNumLeafs(secondDict[key]) - else: numLeafs +=1 + else: + numLeafs += 1 return numLeafs """ @@ -219,6 +224,11 @@ def getNumLeafs(myTree): Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use `is` instead of __name__ == 'dict' + Max is better than if... . 2017-07-24 """ def getTreeDepth(myTree): @@ -226,10 +236,11 @@ def getTreeDepth(myTree): firstStr = next(iter(myTree)) #python3中myTree.keys()返回的是dict_keys,不在是list,所以不能使用myTree.keys()[0]的方法获取结点属性,可以使用list(myTree.keys())[0] secondDict = myTree[firstStr] #获取下一个字典 for key in secondDict.keys(): - if type(secondDict[key]).__name__=='dict': #测试该结点是否为字典,如果不是字典,代表此结点为叶子结点 + if type(secondDict[key]) is dict: #测试该结点是否为字典,如果不是字典,代表此结点为叶子结点 thisDepth = 1 + getTreeDepth(secondDict[key]) - else: thisDepth = 1 - if thisDepth > maxDepth: maxDepth = thisDepth #更新层数 + else: + thisDepth = 1 + maxDepth = max(maxDepth ,thisDepth) #更新层数 return maxDepth """ @@ -272,8 +283,8 @@ def plotNode(nodeTxt, centerPt, parentPt, nodeType): 2017-07-24 """ def plotMidText(cntrPt, parentPt, txtString): - xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0] #计算标注位置 - yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1] + xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0] #计算标注位置 + yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1] createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30) """ @@ -290,6 +301,10 @@ def plotMidText(cntrPt, parentPt, txtString): Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-09 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use `is` instead of __name__ == 'dict' 2017-07-24 """ def plotTree(myTree, parentPt, nodeTxt): @@ -298,19 +313,19 @@ def plotTree(myTree, parentPt, nodeTxt): numLeafs = getNumLeafs(myTree) #获取决策树叶结点数目,决定了树的宽度 depth = getTreeDepth(myTree) #获取决策树层数 firstStr = next(iter(myTree)) #下个字典 - cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff) #中心位置 + cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff) #中心位置 plotMidText(cntrPt, parentPt, nodeTxt) #标注有向边属性值 plotNode(firstStr, cntrPt, parentPt, decisionNode) #绘制结点 secondDict = myTree[firstStr] #下一个字典,也就是继续绘制子结点 - plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD #y偏移 + plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD #y偏移 for key in secondDict.keys(): - if 
type(secondDict[key]).__name__=='dict': #测试该结点是否为字典,如果不是字典,代表此结点为叶子结点 + if type(secondDict[key]) is dict: #测试该结点是否为字典,如果不是字典,代表此结点为叶子结点 plotTree(secondDict[key],cntrPt,str(key)) #不是叶结点,递归调用继续绘制 else: #如果是叶结点,绘制叶结点,并标注有向边属性值 - plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW + plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode) plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key)) - plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD + plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD """ 函数说明:创建绘制面板 @@ -333,8 +348,9 @@ def createPlot(inTree): createPlot.ax1 = plt.subplot(111, frameon=False, **axprops) #去掉x、y轴 plotTree.totalW = float(getNumLeafs(inTree)) #获取决策树叶结点数目 plotTree.totalD = float(getTreeDepth(inTree)) #获取决策树层数 - plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0; #x偏移 - plotTree(inTree, (0.5,1.0), '') #绘制决策树 + plotTree.xOff = -0.5 / plotTree.totalW + plotTree.yOff = 1.0 #x偏移 + plotTree(inTree, (0.5, 1.0), '') #绘制决策树 plt.show() #显示绘制结果 """ @@ -359,9 +375,10 @@ def classify(inputTree, featLabels, testVec): featIndex = featLabels.index(firstStr) for key in secondDict.keys(): if testVec[featIndex] == key: - if type(secondDict[key]).__name__ == 'dict': + if type(secondDict[key]) is dict: classLabel = classify(secondDict[key], featLabels, testVec) - else: classLabel = secondDict[key] + else: + classLabel = secondDict[key] return classLabel """ diff --git a/Decision Tree/Sklearn-Decision Tree.py b/Decision Tree/Sklearn-Decision Tree.py index 86172e0..7fdaf17 100644 --- a/Decision Tree/Sklearn-Decision Tree.py +++ b/Decision Tree/Sklearn-Decision Tree.py @@ -3,7 +3,6 @@ from sklearn.externals.six import StringIO from sklearn import tree import pandas as pd -import numpy as np import pydotplus if __name__ == '__main__': @@ -24,21 +23,38 @@ lenses_list = [] # print(lenses_dict) #打印字典信息 lenses_pd = pd.DataFrame(lenses_dict) #生成pandas.DataFrame - # print(lenses_pd) #打印pandas.DataFrame - le = LabelEncoder() #创建LabelEncoder()对象,用于序列化 + print(lenses_pd) #打印pandas.DataFrame + le = LabelEncoder() #创建LabelEncoder()对象,用于序列化 for col in lenses_pd.columns: #序列化 lenses_pd[col] = le.fit_transform(lenses_pd[col]) # print(lenses_pd) #打印编码信息 + # ---------------- Second way ------------------------- + # 2017-11-09 by Cugtyt + # * GitHub(https://github.com/Cugtyt) + # * Email(cugtyt@qq.com) + # pandas can directly read file, so this way is more easy. 
+ # + # + # lenses_pd = pd.DataFrame(pd.read_table('lenses.txt')) + # lenses_pd.columns = ['age', 'prescript', 'astigmatic', 'tearRate', 'target'] + # lenses_pd = lenses_pd.drop(['target'], axis=1) + # print(lenses_pd) + # le = LabelEncoder() + # for col in lenses_pd.columns: + # lenses_pd[col] = le.fit_transform(lenses_pd[col]) + # print(lenses_pd) + # ---------------------------------------------------- + clf = tree.DecisionTreeClassifier(max_depth = 4) #创建DecisionTreeClassifier()类 clf = clf.fit(lenses_pd.values.tolist(), lenses_target) #使用数据,构建决策树 dot_data = StringIO() tree.export_graphviz(clf, out_file = dot_data, #绘制决策树 - feature_names = lenses_pd.keys(), - class_names = clf.classes_, - filled=True, rounded=True, - special_characters=True) + feature_names = lenses_pd.keys(), + class_names = clf.classes_, + filled=True, rounded=True, + special_characters=True) graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) graph.write_pdf("tree.pdf") #保存绘制好的决策树,以PDF的形式存储。 diff --git a/Logistic/LogRegres-gj.py b/Logistic/LogRegres-gj.py index b103e0e..26e9558 100644 --- a/Logistic/LogRegres-gj.py +++ b/Logistic/LogRegres-gj.py @@ -50,7 +50,7 @@ def loadDataSet(): 2017-08-28 """ def sigmoid(inX): - return 1.0 / (1 + np.exp(-inX)) + return 1 / (1 + np.exp(-inX)) """ 函数说明:梯度上升算法 @@ -72,17 +72,17 @@ def sigmoid(inX): """ def gradAscent(dataMatIn, classLabels): dataMatrix = np.mat(dataMatIn) #转换成numpy的mat - labelMat = np.mat(classLabels).transpose() #转换成numpy的mat,并进行转置 + labelMat = np.mat(classLabels).T #转换成numpy的mat,并进行转置 m, n = np.shape(dataMatrix) #返回dataMatrix的大小。m为行数,n为列数。 alpha = 0.01 #移动步长,也就是学习速率,控制更新的幅度。 maxCycles = 500 #最大迭代次数 - weights = np.ones((n,1)) + weights = np.ones((n, 1)) weights_array = np.array([]) for k in range(maxCycles): h = sigmoid(dataMatrix * weights) #梯度上升矢量化公式 error = labelMat - h weights = weights + alpha * dataMatrix.transpose() * error - weights_array = np.append(weights_array,weights) + weights_array = np.append(weights_array, weights) weights_array = weights_array.reshape(maxCycles,n) return weights.getA(),weights_array #将矩阵转换为数组,并返回 @@ -103,23 +103,25 @@ def gradAscent(dataMatIn, classLabels): Zhihu: https://www.zhihu.com/people/Jack--Cui/ Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use random initialization, and random index, + delete used data is not needed. 
2017-08-31 """ def stocGradAscent1(dataMatrix, classLabels, numIter=150): - m,n = np.shape(dataMatrix) #返回dataMatrix的大小。m为行数,n为列数。 - weights = np.ones(n) #参数初始化 + m, n = np.shape(dataMatrix) #返回dataMatrix的大小。m为行数,n为列数。 + weights = np.random.randn(n) #参数初始化 weights_array = np.array([]) #存储每次更新的回归系数 - for j in range(numIter): - dataIndex = list(range(m)) - for i in range(m): - alpha = 4/(1.0+j+i)+0.01 #降低alpha的大小,每次减小1/(j+i)。 - randIndex = int(random.uniform(0,len(dataIndex))) #随机选取样本 - h = sigmoid(sum(dataMatrix[randIndex]*weights)) #选择随机选取的一个样本,计算h - error = classLabels[randIndex] - h #计算误差 - weights = weights + alpha * error * dataMatrix[randIndex] #更新回归系数 - weights_array = np.append(weights_array,weights,axis=0) #添加回归系数到数组中 - del(dataIndex[randIndex]) #删除已经使用的样本 - weights_array = weights_array.reshape(numIter*m,n) #改变维度 + for j in range(numIter * m): + alpha = 4 / (1.0 + j) + 0.01 #降低alpha的大小,每次减小1/(j+i)。 + randIndex = random.randint(0, m - 1) #随机选取样本 + h = sigmoid(sum(dataMatrix[randIndex] * weights)) #选择随机选取的一个样本,计算h + error = classLabels[randIndex] - h #计算误差 + weights = weights + alpha * error * dataMatrix[randIndex] #更新回归系数 + weights_array = np.append(weights_array, weights, axis=0) #添加回归系数到数组中 + weights_array = weights_array.reshape(numIter*m, n) #改变维度 return weights,weights_array #返回 """ @@ -142,17 +144,21 @@ def plotBestFit(weights): dataMat, labelMat = loadDataSet() #加载数据集 dataArr = np.array(dataMat) #转换成numpy的array数组 n = np.shape(dataMat)[0] #数据个数 - xcord1 = []; ycord1 = [] #正样本 - xcord2 = []; ycord2 = [] #负样本 + xcord1 = [] + ycord1 = [] #正样本 + xcord2 = [] + ycord2 = [] #负样本 for i in range(n): #根据数据集标签进行分类 if int(labelMat[i]) == 1: - xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) #1为正样本 + xcord1.append(dataArr[i,1]) + ycord1.append(dataArr[i,2]) #1为正样本 else: - xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) #0为负样本 + xcord2.append(dataArr[i,1]) + ycord2.append(dataArr[i,2]) #0为负样本 fig = plt.figure() ax = fig.add_subplot(111) #添加subplot - ax.scatter(xcord1, ycord1, s = 20, c = 'red', marker = 's',alpha=.5)#绘制正样本 - ax.scatter(xcord2, ycord2, s = 20, c = 'green',alpha=.5) #绘制负样本 + ax.scatter(xcord1, ycord1, s = 20, c = 'red', marker = 's', alpha=.5)#绘制正样本 + ax.scatter(xcord2, ycord2, s = 20, c = 'green', alpha=.5) #绘制负样本 x = np.arange(-3.0, 3.0, 0.1) y = (-weights[0] - weights[1] * x) / weights[2] ax.plot(x, y) @@ -182,22 +188,22 @@ def plotWeights(weights_array1,weights_array2): font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) #将fig画布分隔成1行1列,不共享x轴和y轴,fig画布的大小为(13,8) #当nrow=3,nclos=2时,代表fig画布被分为六个区域,axs[0][0]表示第一行第一列 - fig, axs = plt.subplots(nrows=3, ncols=2,sharex=False, sharey=False, figsize=(20,10)) + fig, axs = plt.subplots(nrows=3, ncols=2, sharex=False, sharey=False, figsize=(20,10)) x1 = np.arange(0, len(weights_array1), 1) #绘制w0与迭代次数的关系 - axs[0][0].plot(x1,weights_array1[:,0]) - axs0_title_text = axs[0][0].set_title(u'改进的随机梯度上升算法:回归系数与迭代次数关系',FontProperties=font) - axs0_ylabel_text = axs[0][0].set_ylabel(u'W0',FontProperties=font) + axs[0][0].plot(x1, weights_array1[:, 0]) + axs0_title_text = axs[0][0].set_title(u'改进的随机梯度上升算法:回归系数与迭代次数关系', FontProperties=font) + axs0_ylabel_text = axs[0][0].set_ylabel(u'W0', FontProperties=font) plt.setp(axs0_title_text, size=20, weight='bold', color='black') plt.setp(axs0_ylabel_text, size=20, weight='bold', color='black') #绘制w1与迭代次数的关系 - axs[1][0].plot(x1,weights_array1[:,1]) - axs1_ylabel_text = axs[1][0].set_ylabel(u'W1',FontProperties=font) + axs[1][0].plot(x1, weights_array1[:,1]) + 
axs1_ylabel_text = axs[1][0].set_ylabel(u'W1', FontProperties=font) plt.setp(axs1_ylabel_text, size=20, weight='bold', color='black') #绘制w2与迭代次数的关系 axs[2][0].plot(x1,weights_array1[:,2]) - axs2_xlabel_text = axs[2][0].set_xlabel(u'迭代次数',FontProperties=font) - axs2_ylabel_text = axs[2][0].set_ylabel(u'W1',FontProperties=font) + axs2_xlabel_text = axs[2][0].set_xlabel(u'迭代次数', FontProperties=font) + axs2_ylabel_text = axs[2][0].set_ylabel(u'W1', FontProperties=font) plt.setp(axs2_xlabel_text, size=20, weight='bold', color='black') plt.setp(axs2_ylabel_text, size=20, weight='bold', color='black') @@ -205,18 +211,18 @@ def plotWeights(weights_array1,weights_array2): x2 = np.arange(0, len(weights_array2), 1) #绘制w0与迭代次数的关系 axs[0][1].plot(x2,weights_array2[:,0]) - axs0_title_text = axs[0][1].set_title(u'梯度上升算法:回归系数与迭代次数关系',FontProperties=font) - axs0_ylabel_text = axs[0][1].set_ylabel(u'W0',FontProperties=font) + axs0_title_text = axs[0][1].set_title(u'梯度上升算法:回归系数与迭代次数关系', FontProperties=font) + axs0_ylabel_text = axs[0][1].set_ylabel(u'W0', FontProperties=font) plt.setp(axs0_title_text, size=20, weight='bold', color='black') plt.setp(axs0_ylabel_text, size=20, weight='bold', color='black') #绘制w1与迭代次数的关系 axs[1][1].plot(x2,weights_array2[:,1]) - axs1_ylabel_text = axs[1][1].set_ylabel(u'W1',FontProperties=font) + axs1_ylabel_text = axs[1][1].set_ylabel(u'W1', FontProperties=font) plt.setp(axs1_ylabel_text, size=20, weight='bold', color='black') #绘制w2与迭代次数的关系 axs[2][1].plot(x2,weights_array2[:,2]) - axs2_xlabel_text = axs[2][1].set_xlabel(u'迭代次数',FontProperties=font) - axs2_ylabel_text = axs[2][1].set_ylabel(u'W1',FontProperties=font) + axs2_xlabel_text = axs[2][1].set_xlabel(u'迭代次数', FontProperties=font) + axs2_ylabel_text = axs[2][1].set_ylabel(u'W1', FontProperties=font) plt.setp(axs2_xlabel_text, size=20, weight='bold', color='black') plt.setp(axs2_ylabel_text, size=20, weight='bold', color='black') diff --git a/Logistic/LogRegres.py b/Logistic/LogRegres.py index 2d00841..af70034 100644 --- a/Logistic/LogRegres.py +++ b/Logistic/LogRegres.py @@ -77,7 +77,7 @@ def loadDataSet(): 2017-08-28 """ def sigmoid(inX): - return 1.0 / (1 + np.exp(-inX)) + return 1 / (1 + np.exp(-inX)) """ 函数说明:梯度上升算法 @@ -98,15 +98,15 @@ def sigmoid(inX): """ def gradAscent(dataMatIn, classLabels): dataMatrix = np.mat(dataMatIn) #转换成numpy的mat - labelMat = np.mat(classLabels).transpose() #转换成numpy的mat,并进行转置 + labelMat = np.mat(classLabels).T #转换成numpy的mat,并进行转置 m, n = np.shape(dataMatrix) #返回dataMatrix的大小。m为行数,n为列数。 alpha = 0.001 #移动步长,也就是学习速率,控制更新的幅度。 maxCycles = 500 #最大迭代次数 - weights = np.ones((n,1)) + weights = np.ones((n, 1)) for k in range(maxCycles): h = sigmoid(dataMatrix * weights) #梯度上升矢量化公式 error = labelMat - h - weights = weights + alpha * dataMatrix.transpose() * error + weights = weights + alpha * dataMatrix.T * error return weights.getA() #将矩阵转换为数组,返回权重数组 """ @@ -129,19 +129,24 @@ def plotDataSet(): dataMat, labelMat = loadDataSet() #加载数据集 dataArr = np.array(dataMat) #转换成numpy的array数组 n = np.shape(dataMat)[0] #数据个数 - xcord1 = []; ycord1 = [] #正样本 - xcord2 = []; ycord2 = [] #负样本 + xcord1 = [] + ycord1 = [] #正样本 + xcord2 = [] + ycord2 = [] #负样本 for i in range(n): #根据数据集标签进行分类 if int(labelMat[i]) == 1: - xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) #1为正样本 + xcord1.append(dataArr[i, 1]) + ycord1.append(dataArr[i,2]) #1为正样本 else: - xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) #0为负样本 + xcord2.append(dataArr[i,1]) + ycord2.append(dataArr[i,2]) #0为负样本 fig = plt.figure() ax = 
fig.add_subplot(111) #添加subplot - ax.scatter(xcord1, ycord1, s = 20, c = 'red', marker = 's',alpha=.5)#绘制正样本 - ax.scatter(xcord2, ycord2, s = 20, c = 'green',alpha=.5) #绘制负样本 + ax.scatter(xcord1, ycord1, s = 20, c = 'red', marker = 's', alpha=.5)#绘制正样本 + ax.scatter(xcord2, ycord2, s = 20, c = 'green', alpha=.5) #绘制负样本 plt.title('DataSet') #绘制title - plt.xlabel('X1'); plt.ylabel('X2') #绘制label + plt.xlabel('X1') + plt.ylabel('X2') #绘制label plt.show() #显示 """ @@ -164,13 +169,17 @@ def plotBestFit(weights): dataMat, labelMat = loadDataSet() #加载数据集 dataArr = np.array(dataMat) #转换成numpy的array数组 n = np.shape(dataMat)[0] #数据个数 - xcord1 = []; ycord1 = [] #正样本 - xcord2 = []; ycord2 = [] #负样本 + xcord1 = [] + ycord1 = [] #正样本 + xcord2 = [] + ycord2 = [] #负样本 for i in range(n): #根据数据集标签进行分类 if int(labelMat[i]) == 1: - xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) #1为正样本 + xcord1.append(dataArr[i, 1]) + ycord1.append(dataArr[i, 2]) #1为正样本 else: - xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) #0为负样本 + xcord2.append(dataArr[i, 1]) + ycord2.append(dataArr[i, 2]) #0为负样本 fig = plt.figure() ax = fig.add_subplot(111) #添加subplot ax.scatter(xcord1, ycord1, s = 20, c = 'red', marker = 's',alpha=.5)#绘制正样本 @@ -179,7 +188,8 @@ def plotBestFit(weights): y = (-weights[0] - weights[1] * x) / weights[2] ax.plot(x, y) plt.title('BestFit') #绘制title - plt.xlabel('X1'); plt.ylabel('X2') #绘制label + plt.xlabel('X1') + plt.ylabel('X2') #绘制label plt.show() if __name__ == '__main__': diff --git a/Logistic/colicLogRegres.py b/Logistic/colicLogRegres.py index 57bc240..d491fe3 100644 --- a/Logistic/colicLogRegres.py +++ b/Logistic/colicLogRegres.py @@ -20,7 +20,7 @@ 2017-09-05 """ def sigmoid(inX): - return 1.0 / (1 + np.exp(-inX)) + return 1 / (1 + np.exp(-inX)) """ 函数说明:改进的随机梯度上升算法 @@ -38,21 +38,23 @@ def sigmoid(inX): Zhihu: https://www.zhihu.com/people/Jack--Cui/ Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use random initialization, and random index, + delete used data is not needed. 
2017-09-05 """ def stocGradAscent1(dataMatrix, classLabels, numIter=150): - m,n = np.shape(dataMatrix) #返回dataMatrix的大小。m为行数,n为列数。 - weights = np.ones(n) #参数初始化 #存储每次更新的回归系数 - for j in range(numIter): - dataIndex = list(range(m)) - for i in range(m): - alpha = 4/(1.0+j+i)+0.01 #降低alpha的大小,每次减小1/(j+i)。 - randIndex = int(random.uniform(0,len(dataIndex))) #随机选取样本 - h = sigmoid(sum(dataMatrix[randIndex]*weights)) #选择随机选取的一个样本,计算h - error = classLabels[randIndex] - h #计算误差 - weights = weights + alpha * error * dataMatrix[randIndex] #更新回归系数 - del(dataIndex[randIndex]) #删除已经使用的样本 - return weights #返回 + m, n = np.shape(dataMatrix) + weights = np.random.randn(n) + for j in range(numIter * m): + alpha = 1 / (1.0 + j) + 0.01 # 降低alpha的大小,每次减小1/(j+i)。 + randIndex = random.randint(0, m - 1) # 随机选取样本 + h = sigmoid(sum(dataMatrix[randIndex] * weights)) # 选择随机选取的一个样本,计算h + error = classLabels[randIndex] - h # 计算误差 + weights = weights + alpha * error * dataMatrix[randIndex] # 更新回归系数 + return weights #返回 """ @@ -74,11 +76,11 @@ def stocGradAscent1(dataMatrix, classLabels, numIter=150): """ def gradAscent(dataMatIn, classLabels): dataMatrix = np.mat(dataMatIn) #转换成numpy的mat - labelMat = np.mat(classLabels).transpose() #转换成numpy的mat,并进行转置 + labelMat = np.mat(classLabels).T #转换成numpy的mat,并进行转置 m, n = np.shape(dataMatrix) #返回dataMatrix的大小。m为行数,n为列数。 alpha = 0.01 #移动步长,也就是学习速率,控制更新的幅度。 maxCycles = 500 #最大迭代次数 - weights = np.ones((n,1)) + weights = np.ones((n, 1)) for k in range(maxCycles): h = sigmoid(dataMatrix * weights) #梯度上升矢量化公式 error = labelMat - h @@ -101,30 +103,31 @@ def gradAscent(dataMatIn, classLabels): Zhihu: https://www.zhihu.com/people/Jack--Cui/ Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Simplify for by list comprehension, and if... by compute directly. 2017-09-05 """ def colicTest(): frTrain = open('horseColicTraining.txt') #打开训练集 frTest = open('horseColicTest.txt') #打开测试集 - trainingSet = []; trainingLabels = [] + trainingSet = [] + trainingLabels = [] for line in frTrain.readlines(): currLine = line.strip().split('\t') - lineArr = [] - for i in range(len(currLine)-1): - lineArr.append(float(currLine[i])) + lineArr = [float(currLine[i]) for i in range(len(currLine) - 1)] trainingSet.append(lineArr) trainingLabels.append(float(currLine[-1])) - trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels,500) #使用改进的随即上升梯度训练 - errorCount = 0; numTestVec = 0.0 + trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, 500) #使用改进的随即上升梯度训练 + errorCount = 0 + numTestVec = 0 for line in frTest.readlines(): - numTestVec += 1.0 + numTestVec += 1 currLine = line.strip().split('\t') - lineArr =[] - for i in range(len(currLine)-1): - lineArr.append(float(currLine[i])) - if int(classifyVector(np.array(lineArr), trainWeights))!= int(currLine[-1]): - errorCount += 1 - errorRate = (float(errorCount)/numTestVec) * 100 #错误率计算 + lineArr = [float(currLine[i]) for i in range(len(currLine) - 1)] + errorCount += int(classifyVector(np.array(lineArr), trainWeights)) != int(currLine[-1]) + errorRate = (errorCount / numTestVec) * 100 #错误率计算 print("测试集错误率为: %.2f%%" % errorRate) """ @@ -142,12 +145,15 @@ def colicTest(): Zhihu: https://www.zhihu.com/people/Jack--Cui/ Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Simplify return. 
2017-09-05 """ def classifyVector(inX, weights): prob = sigmoid(sum(inX*weights)) - if prob > 0.5: return 1.0 - else: return 0.0 + return prob > 0.5 """ 函数说明:使用Sklearn构建Logistic回归分类器 @@ -163,25 +169,27 @@ def classifyVector(inX, weights): Zhihu: https://www.zhihu.com/people/Jack--Cui/ Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Simplify for by list comprehension. 2017-09-05 """ def colicSklearn(): frTrain = open('horseColicTraining.txt') #打开训练集 frTest = open('horseColicTest.txt') #打开测试集 - trainingSet = []; trainingLabels = [] - testSet = []; testLabels = [] + trainingSet = [] + trainingLabels = [] + testSet = [] + testLabels = [] for line in frTrain.readlines(): currLine = line.strip().split('\t') - lineArr = [] - for i in range(len(currLine)-1): - lineArr.append(float(currLine[i])) + lineArr = [float(currLine[i]) for i in range(len(currLine) - 1)] trainingSet.append(lineArr) trainingLabels.append(float(currLine[-1])) for line in frTest.readlines(): currLine = line.strip().split('\t') - lineArr =[] - for i in range(len(currLine)-1): - lineArr.append(float(currLine[i])) + lineArr = [float(currLine[i]) for i in range(len(currLine) - 1)] testSet.append(lineArr) testLabels.append(float(currLine[-1])) classifier = LogisticRegression(solver = 'sag',max_iter = 5000).fit(trainingSet, trainingLabels) diff --git a/Naive Bayes/bayes-modify.py b/Naive Bayes/bayes-modify.py index b025cf4..4802fd8 100644 --- a/Naive Bayes/bayes-modify.py +++ b/Naive Bayes/bayes-modify.py @@ -16,13 +16,15 @@ Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use list comprehension to clear code. 2017-08-11 """ def createVocabList(dataSet): - vocabSet = set([]) #创建一个空的不重复列表 - for document in dataSet: - vocabSet = vocabSet | set(document) #取并集 - return list(vocabSet) + vocabSet = [d for data in dataSet for d in data] + return list(set(vocabSet)) """ 函数说明:根据vocabList词汇表,将inputSet向量化,向量的每个元素为1或0 @@ -37,15 +39,15 @@ def createVocabList(dataSet): Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use list comprehension to clear code. 2017-08-11 """ def setOfWords2Vec(vocabList, inputSet): - returnVec = [0] * len(vocabList) #创建一个其中所含元素都为0的向量 - for word in inputSet: #遍历每个词条 - if word in vocabList: #如果词条存在于词汇表中,则置1 - returnVec[vocabList.index(word)] = 1 - else: print("the word: %s is not in my Vocabulary!" 
% word) - return returnVec #返回文档向量 + returnVec = [int(val in inputSet) for val in vocabList] + return returnVec """ @@ -64,7 +66,7 @@ def setOfWords2Vec(vocabList, inputSet): 2017-08-14 """ def bagOfWords2VecMN(vocabList, inputSet): - returnVec = [0]*len(vocabList) #创建一个其中所含元素都为0的向量 + returnVec = [0] * len(vocabList) #创建一个其中所含元素都为0的向量 for word in inputSet: #遍历每个词条 if word in vocabList: #如果词条存在于词汇表中,则计数加一 returnVec[vocabList.index(word)] += 1 @@ -90,9 +92,10 @@ def bagOfWords2VecMN(vocabList, inputSet): def trainNB0(trainMatrix,trainCategory): numTrainDocs = len(trainMatrix) #计算训练的文档数目 numWords = len(trainMatrix[0]) #计算每篇文档的词条数 - pAbusive = sum(trainCategory)/float(numTrainDocs) #文档属于侮辱类的概率 + pAbusive = sum(trainCategory) / numTrainDocs #文档属于侮辱类的概率 p0Num = np.ones(numWords); p1Num = np.ones(numWords) #创建numpy.ones数组,词条出现数初始化为1,拉普拉斯平滑 - p0Denom = 2.0; p1Denom = 2.0 #分母初始化为2,拉普拉斯平滑 + p0Denom = 2 + p1Denom = 2 #分母初始化为2,拉普拉斯平滑 for i in range(numTrainDocs): if trainCategory[i] == 1: #统计属于侮辱类的条件概率所需的数据,即P(w0|1),P(w1|1),P(w2|1)··· p1Num += trainMatrix[i] @@ -100,8 +103,8 @@ def trainNB0(trainMatrix,trainCategory): else: #统计属于非侮辱类的条件概率所需的数据,即P(w0|0),P(w1|0),P(w2|0)··· p0Num += trainMatrix[i] p0Denom += sum(trainMatrix[i]) - p1Vect = np.log(p1Num/p1Denom) #取对数,防止下溢出 - p0Vect = np.log(p0Num/p0Denom) + p1Vect = np.log(p1Num / p1Denom) #取对数,防止下溢出 + p0Vect = np.log(p0Num / p0Denom) return p0Vect,p1Vect,pAbusive #返回属于侮辱类的条件概率数组,属于非侮辱类的条件概率数组,文档属于侮辱类的概率 """ @@ -120,15 +123,16 @@ def trainNB0(trainMatrix,trainCategory): Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Simplify return. 2017-08-12 """ def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1): p1 = sum(vec2Classify * p1Vec) + np.log(pClass1) #对应元素相乘。logA * B = logA + logB,所以这里加上log(pClass1) p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1) - if p1 > p0: - return 1 - else: - return 0 + return p1 > p0 """ 函数说明:接收一个大字符串并将其解析为字符串列表 @@ -163,7 +167,9 @@ def textParse(bigString): #将 2017-08-14 """ def spamTest(): - docList = []; classList = []; fullText = [] + docList = [] + classList = [] + fullText = [] for i in range(1, 26): #遍历25个txt文件 wordList = textParse(open('email/spam/%d.txt' % i, 'r').read()) #读取每个垃圾邮件,并字符串转换成字符串列表 docList.append(wordList) @@ -174,12 +180,14 @@ def spamTest(): fullText.append(wordList) classList.append(0) #标记非垃圾邮件,1表示垃圾文件 vocabList = createVocabList(docList) #创建词汇表,不重复 - trainingSet = list(range(50)); testSet = [] #创建存储训练集的索引值的列表和测试集的索引值的列表 + trainingSet = list(range(50)) + testSet = [] #创建存储训练集的索引值的列表和测试集的索引值的列表 for i in range(10): #从50个邮件中,随机挑选出40个作为训练集,10个做测试集 - randIndex = int(random.uniform(0, len(trainingSet))) #随机选取索索引值 + randIndex = random.randint(0, len(trainingSet) - 1) #随机选取索索引值 testSet.append(trainingSet[randIndex]) #添加测试集的索引值 del(trainingSet[randIndex]) #在训练集列表中删除添加到测试集的索引值 - trainMat = []; trainClasses = [] #创建训练集矩阵和训练集类别标签系向量 + trainMat = [] + trainClasses = [] #创建训练集矩阵和训练集类别标签系向量 for docIndex in trainingSet: #遍历训练集 trainMat.append(setOfWords2Vec(vocabList, docList[docIndex])) #将生成的词集模型添加到训练矩阵中 trainClasses.append(classList[docIndex]) #将类别添加到训练集类别标签系向量中 @@ -187,8 +195,8 @@ def spamTest(): errorCount = 0 #错误分类计数 for docIndex in testSet: #遍历测试集 wordVector = setOfWords2Vec(vocabList, docList[docIndex]) #测试集的词集模型 - if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: #如果分类错误 - errorCount += 1 #错误计数加1 + #如果分类错误 + errorCount += classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex] #错误计数加1 
print("分类错误的测试集:",docList[docIndex]) print('错误率:%.2f%%' % (float(errorCount) / len(testSet) * 100)) diff --git a/Naive Bayes/bayes.py b/Naive Bayes/bayes.py index e2425ee..c8d1e41 100644 --- a/Naive Bayes/bayes.py +++ b/Naive Bayes/bayes.py @@ -39,13 +39,15 @@ def loadDataSet(): Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-09 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use list comprehension to clear code. 2017-08-11 """ def createVocabList(dataSet): - vocabSet = set([]) #创建一个空的不重复列表 - for document in dataSet: - vocabSet = vocabSet | set(document) #取并集 - return list(vocabSet) + vocabSet = [d for data in dataSet for d in data] + return list(set(vocabSet)) """ 函数说明:根据vocabList词汇表,将inputSet向量化,向量的每个元素为1或0 @@ -60,15 +62,15 @@ def createVocabList(dataSet): Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-09 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use list comprehension to clear code. 2017-08-11 """ def setOfWords2Vec(vocabList, inputSet): - returnVec = [0] * len(vocabList) #创建一个其中所含元素都为0的向量 - for word in inputSet: #遍历每个词条 - if word in vocabList: #如果词条存在于词汇表中,则置1 - returnVec[vocabList.index(word)] = 1 - else: print("the word: %s is not in my Vocabulary!" % word) - return returnVec #返回文档向量 + returnVec = [int(val in inputSet) for val in vocabList] + return returnVec """ @@ -86,14 +88,20 @@ def setOfWords2Vec(vocabList, inputSet): Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-09 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Remove float(), it is no needed to number. 2017-08-12 """ def trainNB0(trainMatrix,trainCategory): numTrainDocs = len(trainMatrix) #计算训练的文档数目 numWords = len(trainMatrix[0]) #计算每篇文档的词条数 - pAbusive = sum(trainCategory)/float(numTrainDocs) #文档属于侮辱类的概率 - p0Num = np.zeros(numWords); p1Num = np.zeros(numWords) #创建numpy.zeros数组, - p0Denom = 0.0; p1Denom = 0.0 #分母初始化为0.0 + pAbusive = sum(trainCategory) / numTrainDocs #文档属于侮辱类的概率 + p0Num = np.zeros(numWords) + p1Num = np.zeros(numWords) #创建numpy.zeros数组, + p0Denom = 0 + p1Denom = 0 #分母初始化为0.0 for i in range(numTrainDocs): if trainCategory[i] == 1: #统计属于侮辱类的条件概率所需的数据,即P(w0|1),P(w1|1),P(w2|1)··· p1Num += trainMatrix[i] @@ -101,9 +109,9 @@ def trainNB0(trainMatrix,trainCategory): else: #统计属于非侮辱类的条件概率所需的数据,即P(w0|0),P(w1|0),P(w2|0)··· p0Num += trainMatrix[i] p0Denom += sum(trainMatrix[i]) - p1Vect = p1Num/p1Denom #相除 - p0Vect = p0Num/p0Denom - return p0Vect,p1Vect,pAbusive #返回属于侮辱类的条件概率数组,属于非侮辱类的条件概率数组,文档属于侮辱类的概率 + p1Vect = p1Num / p1Denom #相除 + p0Vect = p0Num / p0Denom + return p0Vect, p1Vect, pAbusive #返回属于侮辱类的条件概率数组,属于非侮辱类的条件概率数组,文档属于侮辱类的概率 """ 函数说明:朴素贝叶斯分类器分类函数 @@ -121,17 +129,18 @@ def trainNB0(trainMatrix,trainCategory): Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Simplify return. 
2017-08-12 """ def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1): - p1 = reduce(lambda x,y:x*y, vec2Classify * p1Vec) * pClass1 #对应元素相乘 - p0 = reduce(lambda x,y:x*y, vec2Classify * p0Vec) * (1.0 - pClass1) + p1 = reduce(lambda x, y: x * y, vec2Classify * p1Vec) * pClass1 #对应元素相乘 + p0 = reduce(lambda x, y: x * y, vec2Classify * p0Vec) * (1.0 - pClass1) print('p0:',p0) print('p1:',p1) - if p1 > p0: - return 1 - else: - return 0 + return p1 > p0 """ 函数说明:测试朴素贝叶斯分类器 @@ -145,14 +154,16 @@ def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1): Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use list comprehension to clear code. 2017-08-12 """ def testingNB(): - listOPosts,listClasses = loadDataSet() #创建实验样本 + listOPosts, listClasses = loadDataSet() #创建实验样本 myVocabList = createVocabList(listOPosts) #创建词汇表 - trainMat=[] - for postinDoc in listOPosts: - trainMat.append(setOfWords2Vec(myVocabList, postinDoc)) #将实验样本向量化 + trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts] #将实验样本向量化 p0V,p1V,pAb = trainNB0(np.array(trainMat),np.array(listClasses)) #训练朴素贝叶斯分类器 testEntry = ['love', 'my', 'dalmation'] #测试样本1 thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry)) #测试样本向量化 diff --git a/Naive Bayes/nbc.py b/Naive Bayes/nbc.py index da9a521..011bfb4 100644 --- a/Naive Bayes/nbc.py +++ b/Naive Bayes/nbc.py @@ -1,4 +1,5 @@ # -*- coding: UTF-8 -*- +from collections import defaultdict from sklearn.naive_bayes import MultinomialNB import matplotlib.pyplot as plt import os @@ -22,6 +23,10 @@ Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use enumerate and defaultdict to make code more readable. 2017-08-22 """ def TextProcessing(folder_path, test_size = 0.2): @@ -34,9 +39,8 @@ def TextProcessing(folder_path, test_size = 0.2): new_folder_path = os.path.join(folder_path, folder) #根据子文件夹,生成新的路径 files = os.listdir(new_folder_path) #存放子文件夹下的txt文件的列表 - j = 1 #遍历每个txt文件 - for file in files: + for j, file in enumerate(files): if j > 100: #每类txt样本数最多100个 break with open(os.path.join(new_folder_path, file), 'r', encoding = 'utf-8') as f: #打开txt文件 @@ -47,7 +51,6 @@ def TextProcessing(folder_path, test_size = 0.2): data_list.append(word_list) #添加数据集数据 class_list.append(folder) #添加数据集类别 - j += 1 data_class_list = list(zip(data_list, class_list)) #zip压缩合并,将数据与标签对应压缩 random.shuffle(data_class_list) #将data_class_list乱序 @@ -57,14 +60,11 @@ def TextProcessing(folder_path, test_size = 0.2): train_data_list, train_class_list = zip(*train_list) #训练集解压缩 test_data_list, test_class_list = zip(*test_list) #测试集解压缩 - all_words_dict = {} #统计训练集词频 + all_words_dict = defaultdict(int) # 统计训练集词频 for word_list in train_data_list: for word in word_list: - if word in all_words_dict.keys(): - all_words_dict[word] += 1 - else: - all_words_dict[word] = 1 - + all_words_dict[word] += 1 + #根据键的值倒序排序 all_words_tuple_list = sorted(all_words_dict.items(), key = lambda f:f[1], reverse = True) all_words_list, all_words_nums = zip(*all_words_tuple_list) #解压缩 @@ -135,18 +135,20 @@ def text_features(text, feature_words): #出现在特征集中,则置1 Blog: http://blog.csdn.net/c406495762 Modify: + 2017-11-15 by Cugtyt + * GitHub(https://github.com/Cugtyt) + * Email(cugtyt@qq.com) + Use enumerate to simplify iteration. 
2017-08-22 """ def words_dict(all_words_list, deleteN, stopwords_set = set()): feature_words = [] #特征列表 - n = 1 - for t in range(deleteN, len(all_words_list), 1): + for n, t in enumerate(range(deleteN, len(all_words_list), 1)): if n > 1000: #feature_words的维度为1000 break #如果这个词不是数字,并且不是指定的结束语,并且单词长度大于1小于5,那么这个词就可以作为特征词 if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5: feature_words.append(all_words_list[t]) - n += 1 return feature_words """ diff --git a/SVM/svm-digits.py b/SVM/svm-digits.py index c19ee6b..e59f32b 100644 --- a/SVM/svm-digits.py +++ b/SVM/svm-digits.py @@ -30,12 +30,12 @@ def __init__(self, dataMatIn, classLabels, C, toler, kTup): self.C = C #松弛变量 self.tol = toler #容错率 self.m = np.shape(dataMatIn)[0] #数据矩阵行数 - self.alphas = np.mat(np.zeros((self.m,1))) #根据矩阵行数初始化alpha参数为0 + self.alphas = np.mat(np.zeros((self.m, 1))) #根据矩阵行数初始化alpha参数为0 self.b = 0 #初始化b参数为0 - self.eCache = np.mat(np.zeros((self.m,2))) #根据矩阵行数初始化虎误差缓存,第一列为是否有效的标志位,第二列为实际的误差E的值。 - self.K = np.mat(np.zeros((self.m,self.m))) #初始化核K + self.eCache = np.mat(np.zeros((self.m, 2))) #根据矩阵行数初始化虎误差缓存,第一列为是否有效的标志位,第二列为实际的误差E的值。 + self.K = np.mat(np.zeros((self.m, self.m))) #初始化核K for i in range(self.m): #计算所有数据的核K - self.K[:,i] = kernelTrans(self.X, self.X[i,:], kTup) + self.K[:, i] = kernelTrans(self.X, self.X[i], kTup) def kernelTrans(X, A, kTup): """ @@ -48,14 +48,16 @@ def kernelTrans(X, A, kTup): K - 计算的核K """ m,n = np.shape(X) - K = np.mat(np.zeros((m,1))) - if kTup[0] == 'lin': K = X * A.T #线性核函数,只进行内积。 + K = np.mat(np.zeros((m, 1))) + if kTup[0] == 'lin': + K = X * A.T #线性核函数,只进行内积。 elif kTup[0] == 'rbf': #高斯核函数,根据高斯核函数公式进行计算 for j in range(m): - deltaRow = X[j,:] - A - K[j] = deltaRow*deltaRow.T - K = np.exp(K/(-1*kTup[1]**2)) #计算高斯核K - else: raise NameError('核函数无法识别') + deltaRow = X[j] - A + K[j] = deltaRow * deltaRow.T + K = np.exp(- K / kTup[1]**2) #计算高斯核K + else: + raise NameError('核函数无法识别') return K #返回计算的核K def loadDataSet(fileName): @@ -67,7 +69,8 @@ def loadDataSet(fileName): dataMat - 数据矩阵 labelMat - 数据标签 """ - dataMat = []; labelMat = [] + dataMat = [] + labelMat = [] fr = open(fileName) for line in fr.readlines(): #逐行读取,滤除空格等 lineArr = line.strip().split('\t') @@ -84,7 +87,7 @@ def calcEk(oS, k): Returns: Ek - 标号为k的数据误差 """ - fXk = float(np.multiply(oS.alphas,oS.labelMat).T*oS.K[:,k] + oS.b) + fXk = np.multiply(oS.alphas, oS.labelMat).T*oS.K[:,k] + oS.b Ek = fXk - float(oS.labelMat[k]) return Ek @@ -99,8 +102,8 @@ def selectJrand(i, m): j - alpha_j的索引值 """ j = i #选择一个不等于i的j - while (j == i): - j = int(random.uniform(0, m)) + while j == i: + j = random.randint(0, m - 1) return j def selectJ(i, oS, Ei): @@ -114,16 +117,21 @@ def selectJ(i, oS, Ei): j, maxK - 标号为j或maxK的数据的索引值 Ej - 标号为j的数据误差 """ - maxK = -1; maxDeltaE = 0; Ej = 0 #初始化 - oS.eCache[i] = [1,Ei] #根据Ei更新误差缓存 - validEcacheList = np.nonzero(oS.eCache[:,0].A)[0] #返回误差不为0的数据的索引值 + maxK = -1 + maxDeltaE = 0 + Ej = 0 #初始化 + oS.eCache[i] = [1, Ei] #根据Ei更新误差缓存 + validEcacheList = np.nonzero(oS.eCache[:, 0].A)[0] #返回误差不为0的数据的索引值 if (len(validEcacheList)) > 1: #有不为0的误差 for k in validEcacheList: #遍历,找到最大的Ek - if k == i: continue #不计算i,浪费时间 + if k == i: + continue #不计算i,浪费时间 Ek = calcEk(oS, k) #计算Ek deltaE = abs(Ei - Ek) #计算|Ei-Ek| - if (deltaE > maxDeltaE): #找到maxDeltaE - maxK = k; maxDeltaE = deltaE; Ej = Ek + if deltaE > maxDeltaE: #找到maxDeltaE + maxK = k + maxDeltaE = deltaE + Ej = Ek return maxK, Ej #返回maxK,Ej else: #没有不为0的误差 j = selectJrand(i, oS.m) #随机选择alpha_j的索引值 @@ -140,7 +148,7 @@ 
def updateEk(oS, k):
         无
     """
     Ek = calcEk(oS, k)                                        #计算Ek
-    oS.eCache[k] = [1,Ek]                                     #更新误差缓存
+    oS.eCache[k] = [1, Ek]                                    #更新误差缓存
 
 
 def clipAlpha(aj,H,L):
@@ -153,10 +161,8 @@ def clipAlpha(aj,H,L):
     Returns:
         aj - 修剪后的alpah_j的值
     """
-    if aj > H:
-        aj = H
-    if L > aj:
-        aj = L
+    aj = min(aj, H)
+    aj = max(L, aj)
     return aj
 
 def innerL(i, oS):
@@ -174,11 +180,12 @@ def innerL(i, oS):
     #优化alpha,设定一定的容错率。
     if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
         #使用内循环启发方式2选择alpha_j,并计算Ej
-        j,Ej = selectJ(i, oS, Ei)
+        j, Ej = selectJ(i, oS, Ei)
         #保存更新前的aplpha值,使用深拷贝
-        alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy();
+        alphaIold = oS.alphas[i].copy()
+        alphaJold = oS.alphas[j].copy()
         #步骤2:计算上下界L和H
-        if (oS.labelMat[i] != oS.labelMat[j]):
+        if oS.labelMat[i] != oS.labelMat[j]:
             L = max(0, oS.alphas[j] - oS.alphas[i])
             H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
         else:
@@ -188,35 +195,41 @@ def innerL(i, oS):
             print("L==H")
             return 0
         #步骤3:计算eta
-        eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j]
-        if eta >= 0:
+        eta = 2.0 * oS.K[i, j] - oS.K[i, i] - oS.K[j, j]    #eta必须用核矩阵K计算:-(xi-xj)*(xi-xj).T只对线性核成立,rbf核下会算错
+        if eta >= 0:
             print("eta>=0")
             return 0
         #步骤4:更新alpha_j
-        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej)/eta
+        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
         #步骤5:修剪alpha_j
-        oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
+        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
         #更新Ej至误差缓存
         updateEk(oS, j)
-        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
+        if abs(oS.alphas[j] - alphaJold) < 0.00001:
             print("alpha_j变化太小")
             return 0
         #步骤6:更新alpha_i
-        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
+        oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j])
         #更新Ei至误差缓存
         updateEk(oS, i)
         #步骤7:更新b_1和b_2
-        b1 = oS.b - Ei- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j]
-        b2 = oS.b - Ej- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j]- oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
+        b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, i] \
+             - oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.K[i, j]
+        b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, j] \
+             - oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.K[j, j]
         #步骤8:根据b_1和b_2更新b
-        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1
-        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2
-        else: oS.b = (b1 + b2)/2.0
+        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
+            oS.b = b1
+        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
+            oS.b = b2
+        else:
+            oS.b = (b1 + b2) / 2
         return 1
     else:
        return 0
 
-def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup = ('lin',0)):
+def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup = ('lin', 0)):
     """
     完整的线性SMO算法
     Parameters:
@@ -230,28 +243,24 @@ def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup = ('lin', 0)):
         oS.b - SMO算法计算的b
         oS.alphas - SMO算法计算的alphas
     """
-    oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).transpose(), C, toler, kTup)        #初始化数据结构
+    oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).T, C, toler, kTup)                  #初始化数据结构
     iter = 0                                                                                  #初始化当前迭代次数
-    entireSet = True; alphaPairsChanged = 0
+    entireSet = True
+    alphaPairsChanged = 0
     while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):                      #遍历整个数据集都alpha也没有更新或者超过最大迭代次数,则退出循环
         alphaPairsChanged = 0
         if entireSet:                                                                         #遍历整个数据集
-            for i in range(oS.m):
-                alphaPairsChanged += innerL(i,oS)                                             #使用优化的SMO算法
-                print("全样本遍历:第%d次迭代 样本:%d, alpha优化次数:%d" % (iter,i,alphaPairsChanged))
-            iter += 1
+            alphaPairsChanged += 
sum(innerL(i, oS) for i in range(oS.m)) else: #遍历非边界值 nonBoundIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0] #遍历不在边界0和C的alpha - for i in nonBoundIs: - alphaPairsChanged += innerL(i,oS) - print("非边界遍历:第%d次迭代 样本:%d, alpha优化次数:%d" % (iter,i,alphaPairsChanged)) - iter += 1 + alphaPairsChanged += sum(innerL(i, oS) for i in nonBoundIs) + iter += 1 if entireSet: #遍历一次后改为非边界遍历 entireSet = False - elif (alphaPairsChanged == 0): #如果alpha没有更新,计算全样本遍历 + elif alphaPairsChanged == 0: #如果alpha没有更新,计算全样本遍历 entireSet = True print("迭代次数: %d" % iter) - return oS.b,oS.alphas #返回SMO算法计算的b和alphas + return oS.b, oS.alphas #返回SMO算法计算的b和alphas def img2vector(filename): @@ -262,12 +271,12 @@ def img2vector(filename): Returns: returnVect - 返回的二进制图像的1x1024向量 """ - returnVect = np.zeros((1,1024)) + returnVect = np.zeros((1, 1024)) fr = open(filename) for i in range(32): lineStr = fr.readline() for j in range(32): - returnVect[0,32*i+j] = int(lineStr[j]) + returnVect[0, 32 * i + j] = int(lineStr[j]) return returnVect def loadImages(dirName): @@ -283,14 +292,13 @@ def loadImages(dirName): hwLabels = [] trainingFileList = listdir(dirName) m = len(trainingFileList) - trainingMat = np.zeros((m,1024)) + trainingMat = np.zeros((m, 1024)) for i in range(m): fileNameStr = trainingFileList[i] fileStr = fileNameStr.split('.')[0] classNumStr = int(fileStr.split('_')[0]) - if classNumStr == 9: hwLabels.append(-1) - else: hwLabels.append(1) - trainingMat[i,:] = img2vector('%s/%s' % (dirName, fileNameStr)) + hwLabels.append(-1 if classNumStr == 9 else 1) + trainingMat[i] = img2vector('%s/%s' % (dirName, fileNameStr)) return trainingMat, hwLabels def testDigits(kTup=('rbf', 10)): @@ -301,29 +309,30 @@ def testDigits(kTup=('rbf', 10)): Returns: 无 """ - dataArr,labelArr = loadImages('trainingDigits') + dataArr, labelArr = loadImages('trainingDigits') b,alphas = smoP(dataArr, labelArr, 200, 0.0001, 10, kTup) - datMat = np.mat(dataArr); labelMat = np.mat(labelArr).transpose() - svInd = np.nonzero(alphas.A>0)[0] + datMat = np.mat(dataArr) + labelMat = np.mat(labelArr).T + svInd = np.nonzero(alphas.A > 0)[0] sVs=datMat[svInd] - labelSV = labelMat[svInd]; + labelSV = labelMat[svInd] print("支持向量个数:%d" % np.shape(sVs)[0]) m,n = np.shape(datMat) errorCount = 0 for i in range(m): - kernelEval = kernelTrans(sVs,datMat[i,:],kTup) - predict=kernelEval.T * np.multiply(labelSV,alphas[svInd]) + b - if np.sign(predict) != np.sign(labelArr[i]): errorCount += 1 - print("训练集错误率: %.2f%%" % (float(errorCount)/m)) - dataArr,labelArr = loadImages('testDigits') + kernelEval = kernelTrans(sVs,datMat[i],kTup) + predict = kernelEval.T * np.multiply(labelSV,alphas[svInd]) + b + errorCount += np.sign(predict) != np.sign(labelArr[i]) + print("训练集错误率: %.2f%%" % (errorCount / m)) + dataArr, labelArr = loadImages('testDigits') errorCount = 0 - datMat = np.mat(dataArr); labelMat = np.mat(labelArr).transpose() - m,n = np.shape(datMat) + datMat = np.mat(dataArr) + m, n = np.shape(datMat) for i in range(m): - kernelEval = kernelTrans(sVs,datMat[i,:],kTup) - predict=kernelEval.T * np.multiply(labelSV,alphas[svInd]) + b - if np.sign(predict) != np.sign(labelArr[i]): errorCount += 1 - print("测试集错误率: %.2f%%" % (float(errorCount)/m)) + kernelEval = kernelTrans(sVs,datMat[i], kTup) + predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b + errorCount += np.sign(predict) != np.sign(labelArr[i]) + print("测试集错误率: %.2f%%" % (errorCount / m)) if __name__ == '__main__': testDigits() \ No newline at end of file diff --git a/SVM/svm-simple.py b/SVM/svm-simple.py 
index 633d3ad..7e4d92e 100644
--- a/SVM/svm-simple.py
+++ b/SVM/svm-simple.py
@@ -21,13 +21,14 @@
         2017-09-21
 """
 def loadDataSet(fileName):
-    dataMat = []; labelMat = []
+    dataMat = []
+    labelMat = []
     fr = open(fileName)
     for line in fr.readlines():                                     #逐行读取,滤除空格等
         lineArr = line.strip().split('\t')
         dataMat.append([float(lineArr[0]), float(lineArr[1])])      #添加数据
         labelMat.append(float(lineArr[2]))                          #添加标签
-    return dataMat,labelMat
+    return dataMat, labelMat
 
 
 """
@@ -49,8 +50,8 @@ def loadDataSet(fileName):
 """
 def selectJrand(i, m):
     j = i                                 #选择一个不等于i的j
-    while (j == i):
-        j = int(random.uniform(0, m))
+    while j == i:
+        j = random.randint(0, m - 1)
     return j
 
 """
@@ -69,13 +70,15 @@ def selectJrand(i, m):
     Zhihu:
         https://www.zhihu.com/people/Jack--Cui/
     Modify:
+        2017-11-15 by Cugtyt
+            * GitHub(https://github.com/Cugtyt)
+            * Email(cugtyt@qq.com)
+            Use max and min to simplify if.
         2017-09-21
 """
-def clipAlpha(aj,H,L):
-    if aj > H:
-        aj = H
-    if L > aj:
-        aj = L
+def clipAlpha(aj, H, L):
+    aj = min(aj, H)
+    aj = max(L, aj)
     return aj
 
 """
@@ -93,6 +96,10 @@ def clipAlpha(aj,H,L):
     Zhihu:
         https://www.zhihu.com/people/Jack--Cui/
     Modify:
+        2017-11-15 by Cugtyt
+            * GitHub(https://github.com/Cugtyt)
+            * Email(cugtyt@qq.com)
+            Remove unused code.
         2017-09-21
 """
 def showDataSet(dataMat, labelMat):
@@ -103,10 +110,8 @@ def showDataSet(dataMat, labelMat):
             data_plus.append(dataMat[i])
         else:
             data_minus.append(dataMat[i])
-    data_plus_np = np.array(data_plus)                                            #转换为numpy矩阵
-    data_minus_np = np.array(data_minus)                                          #转换为numpy矩阵
-    plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1])     #正样本散点图
-    plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1])   #负样本散点图
+    plt.scatter(np.transpose(data_plus)[0], np.transpose(data_plus)[1])           #正样本散点图
+    plt.scatter(np.transpose(data_minus)[0], np.transpose(data_minus)[1])         #负样本散点图
     plt.show()
 
 
 """
@@ -128,33 +133,40 @@ def showDataSet(dataMat, labelMat):
     Zhihu:
         https://www.zhihu.com/people/Jack--Cui/
     Modify:
+        2017-11-15 by Cugtyt
+            * GitHub(https://github.com/Cugtyt)
+            * Email(cugtyt@qq.com)
+            Use -(a - b)**2 instead of 2 * a * b - a**2 - b**2.
         2017-09-23
 """
 def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
     #转换为numpy的mat存储
-    dataMatrix = np.mat(dataMatIn); labelMat = np.mat(classLabels).transpose()
+    dataMatrix = np.mat(dataMatIn)
+    labelMat = np.mat(classLabels).T
     #初始化b参数,统计dataMatrix的维度
-    b = 0; m,n = np.shape(dataMatrix)
+    b = 0
+    m,n = np.shape(dataMatrix)
     #初始化alpha参数,设为0
-    alphas = np.mat(np.zeros((m,1)))
+    alphas = np.mat(np.zeros((m, 1)))
     #初始化迭代次数
     iter_num = 0
     #最多迭代matIter次
-    while (iter_num < maxIter):
+    while iter_num < maxIter:
         alphaPairsChanged = 0
         for i in range(m):
             #步骤1:计算误差Ei
-            fXi = float(np.multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b
-            Ei = fXi - float(labelMat[i])
+            fXi = np.multiply(alphas, labelMat).T * (dataMatrix * dataMatrix[i].T) + b
+            Ei = fXi - labelMat[i]
             #优化alpha,设定一定的容错率。
             if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > 0)):
                 #随机选择另一个与alpha_i成对优化的alpha_j
-                j = selectJrand(i,m)
+                j = selectJrand(i, m)
                 #步骤1:计算误差Ej
-                fXj = float(np.multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b
-                Ej = fXj - float(labelMat[j])
+                fXj = np.multiply(alphas, labelMat).T * (dataMatrix * dataMatrix[j].T) + b
+                Ej = fXj - labelMat[j]
                 #保存更新前的aplpha值,使用深拷贝
-                alphaIold = alphas[i].copy(); alphaJold = alphas[j].copy();
+                alphaIold = alphas[i].copy()
+                alphaJold = alphas[j].copy()
                 #步骤2:计算上下界L和H
                 if (labelMat[i] != labelMat[j]):
                     L = max(0, alphas[j] - alphas[i])
@@ -162,31 +174,42 @@ def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
                 else:
                     L = max(0, alphas[j] + alphas[i] - C)
                     H = min(C, alphas[j] + alphas[i])
-                if L==H: print("L==H"); continue
+                if L==H:
+                    print("L==H")
+                    continue
                 #步骤3:计算eta
-                eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - dataMatrix[j,:]*dataMatrix[j,:].T
-                if eta >= 0: print("eta>=0"); continue
+                eta = dataMatrix[i] - dataMatrix[j]
+                eta = - eta * eta.T
+                if eta >= 0:
+                    print("eta>=0")
+                    continue
                 #步骤4:更新alpha_j
-                alphas[j] -= labelMat[j]*(Ei - Ej)/eta
+                alphas[j] -= labelMat[j] * (Ei - Ej) / eta
                 #步骤5:修剪alpha_j
-                alphas[j] = clipAlpha(alphas[j],H,L)
-                if (abs(alphas[j] - alphaJold) < 0.00001): print("alpha_j变化太小"); continue
+                alphas[j] = clipAlpha(alphas[j], H, L)
+                if (abs(alphas[j] - alphaJold) < 0.00001):
+                    print("alpha_j变化太小")
+                    continue
                 #步骤6:更新alpha_i
-                alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j])
+                alphas[i] += labelMat[j] * labelMat[i] * (alphaJold - alphas[j])
                 #步骤7:更新b_1和b_2
-                b1 = b - Ei- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T
-                b2 = b - Ej- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T
+                b1 = b - Ei - labelMat[i] * (alphas[i] - alphaIold) * dataMatrix[i] * dataMatrix[i].T \
+                     - labelMat[j] * (alphas[j] - alphaJold) * dataMatrix[i] * dataMatrix[j].T
+                b2 = b - Ej - labelMat[i] * (alphas[i] - alphaIold) * dataMatrix[i] * dataMatrix[j].T \
+                     - labelMat[j] * (alphas[j] - alphaJold) * dataMatrix[j] * dataMatrix[j].T
                 #步骤8:根据b_1和b_2更新b
-                if (0 < alphas[i]) and (C > alphas[i]): b = b1
-                elif (0 < alphas[j]) and (C > alphas[j]): b = b2
-                else: b = (b1 + b2)/2.0
+                if (0 < alphas[i]) and (C > alphas[i]):
+                    b = b1
+                elif (0 < alphas[j]) and (C > alphas[j]):
+                    b = b2
+                else:
+                    b = (b1 + b2) / 2
                 #统计优化次数
                 alphaPairsChanged += 1
                 #打印统计信息
                 print("第%d次迭代 样本:%d, alpha优化次数:%d" % (iter_num,i,alphaPairsChanged))
         #更新迭代次数
-        if (alphaPairsChanged == 0): iter_num += 1
-        else: iter_num = 0
+        iter_num = iter_num + 1 if not alphaPairsChanged else 0
         print("迭代次数: %d" % iter_num)
     return b,alphas
 
@@ -228,7 +251,7 @@ def showClassifer(dataMat, w, b):
     b = float(b)
     a1 = float(a1[0])
     a2 = float(a2[0])
-    y1, y2 = (-b- a1*x1)/a2, (-b - a1*x2)/a2
+    y1, y2 = (-b - a1 * x1) / a2, (-b - a1 * x2) / a2
     plt.plot([x1, x2], [y1, y2])
     #找出支持向量点
     for i, alpha in enumerate(alphas):
@@ -258,7 +281,7 @@
 """
 def get_w(dataMat, labelMat, alphas):
     alphas, dataMat, labelMat = np.array(alphas), np.array(dataMat), np.array(labelMat)
-    w = np.dot((np.tile(labelMat.reshape(1, -1).T, (1, 2)) * dataMat).T, alphas)
+    w = np.dot((labelMat.reshape(-1, 1) * dataMat).T, alphas)
     return w.tolist()
 
 
diff --git a/SVM/svm-smo.py b/SVM/svm-smo.py
index 973356a..1980b48 100644
--- a/SVM/svm-smo.py
+++ b/SVM/svm-smo.py
@@ -42,7 +42,8 @@ def loadDataSet(fileName):
         dataMat - 数据矩阵
         labelMat - 数据标签
     """
-    dataMat = []; labelMat = []
+    dataMat = []
+    labelMat = []
     fr = open(fileName)
     for line in fr.readlines():                                     #逐行读取,滤除空格等
         lineArr = line.strip().split('\t')
@@ -59,7 +60,7 @@ def calcEk(oS, k):
     Returns:
         Ek - 标号为k的数据误差
     """
-    fXk = float(np.multiply(oS.alphas,oS.labelMat).T*(oS.X*oS.X[k,:].T) + oS.b)
+    fXk = np.multiply(oS.alphas, oS.labelMat).T * (oS.X * oS.X[k].T) + oS.b
     Ek = fXk - float(oS.labelMat[k])
     return Ek
 
@@ -74,8 +75,8 @@ def selectJrand(i, m):
         j - alpha_j的索引值
     """
     j = i                                 #选择一个不等于i的j
-    while (j == i):
-        j = int(random.uniform(0, m))
+    while j == i:
+        j = random.randint(0, m - 1)
     return j
 
 def selectJ(i, oS, Ei):
@@ -89,16 +90,21 @@ def selectJ(i, oS, Ei):
         j, maxK - 标号为j或maxK的数据的索引值
         Ej - 标号为j的数据误差
     """
-    maxK = -1; maxDeltaE = 0; Ej = 0                         #初始化
-    oS.eCache[i] = [1,Ei]                                    #根据Ei更新误差缓存
-    validEcacheList = np.nonzero(oS.eCache[:,0].A)[0]        #返回误差不为0的数据的索引值
-    if (len(validEcacheList)) > 1:                           #有不为0的误差
+    maxK = -1
+    maxDeltaE = 0
+    Ej = 0                                                   #初始化
+    oS.eCache[i] = [1, Ei]                                   #根据Ei更新误差缓存
+    validEcacheList = np.nonzero(oS.eCache[:, 0].A)[0]       #返回误差不为0的数据的索引值
+    if len(validEcacheList) > 1:                             #有不为0的误差
         for k in validEcacheList:                            #遍历,找到最大的Ek
-            if k == i: continue                              #不计算i,浪费时间
+            if k == i:
+                continue                                     #不计算i,浪费时间
             Ek = calcEk(oS, k)                               #计算Ek
             deltaE = abs(Ei - Ek)                            #计算|Ei-Ek|
-            if (deltaE > maxDeltaE):                         #找到maxDeltaE
-                maxK = k; maxDeltaE = deltaE; Ej = Ek
+            if deltaE > maxDeltaE:                           #找到maxDeltaE
+                maxK = k
+                maxDeltaE = deltaE
+                Ej = Ek
         return maxK, Ej                                      #返回maxK,Ej
     else:                                                    #没有不为0的误差
         j = selectJrand(i, oS.m)                             #随机选择alpha_j的索引值
@@ -115,10 +121,10 @@ def updateEk(oS, k):
         无
     """
     Ek = calcEk(oS, k)                                       #计算Ek
-    oS.eCache[k] = [1,Ek]                                    #更新误差缓存
+    oS.eCache[k] = [1, Ek]                                   #更新误差缓存
 
 
-def clipAlpha(aj,H,L):
+def clipAlpha(aj, H, L):
     """
     修剪alpha_j
     Parameters:
@@ -128,10 +134,8 @@ def clipAlpha(aj,H,L):
     Returns:
         aj - 修剪后的alpah_j的值
     """
-    if aj > H:
-        aj = H
-    if L > aj:
-        aj = L
+    aj = min(aj, H)
+    aj = max(L, aj)
     return aj
 
 def innerL(i, oS):
@@ -149,11 +153,12 @@ def innerL(i, oS):
     #优化alpha,设定一定的容错率。
     if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
         #使用内循环启发方式2选择alpha_j,并计算Ej
-        j,Ej = selectJ(i, oS, Ei)
+        j, Ej = selectJ(i, oS, Ei)
         #保存更新前的aplpha值,使用深拷贝
-        alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy();
+        alphaIold = oS.alphas[i].copy()
+        alphaJold = oS.alphas[j].copy()
         #步骤2:计算上下界L和H
-        if (oS.labelMat[i] != oS.labelMat[j]):
+        if oS.labelMat[i] != oS.labelMat[j]:
             L = max(0, oS.alphas[j] - oS.alphas[i])
             H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
         else:
@@ -163,30 +168,36 @@ def innerL(i, oS):
             print("L==H")
             return 0
         #步骤3:计算eta
-        eta = 2.0 * oS.X[i,:] * oS.X[j,:].T - oS.X[i,:] * oS.X[i,:].T - oS.X[j,:] * oS.X[j,:].T
-        if eta >= 0:
+        eta = oS.X[i] - oS.X[j]
+        eta = - eta * eta.T
+        if eta >= 0:
             print("eta>=0")
             return 0
         #步骤4:更新alpha_j
-        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej)/eta
+        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
         #步骤5:修剪alpha_j
-        oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
+        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
         #更新Ej至误差缓存
         updateEk(oS, j)
-        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
+        if abs(oS.alphas[j] - alphaJold) < 0.00001:
             print("alpha_j变化太小")
             return 0
         #步骤6:更新alpha_i
-        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
+        oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j])
         #更新Ei至误差缓存
         updateEk(oS, i)
        #步骤7:更新b_1和b_2
-        b1 = oS.b - Ei- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[i,:].T - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.X[i,:]*oS.X[j,:].T
-        b2 = oS.b - Ej- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[j,:].T - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.X[j,:]*oS.X[j,:].T
+        b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i] * oS.X[i].T \
+             - oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.X[i] * oS.X[j].T
+        b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i] * oS.X[j].T \
+             - oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.X[j] * oS.X[j].T
         #步骤8:根据b_1和b_2更新b
-        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1
-        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2
-        else: oS.b = (b1 + b2)/2.0
+        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
+            oS.b = b1
+        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
+            oS.b = b2
+        else:
+            oS.b = (b1 + b2) / 2
         return 1
     else:
         return 0
 
@@ -206,26 +217,22 @@
     """
     oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).transpose(), C, toler)    #初始化数据结构
     iter = 0                                                                        #初始化当前迭代次数
-    entireSet = True; alphaPairsChanged = 0
+    entireSet = True
+    alphaPairsChanged = 0
     while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):            #遍历整个数据集都alpha也没有更新或者超过最大迭代次数,则退出循环
         alphaPairsChanged = 0
-        if entireSet:                                                               #遍历整个数据集
-            for i in range(oS.m):
-                alphaPairsChanged += innerL(i,oS)                                   #使用优化的SMO算法
-                print("全样本遍历:第%d次迭代 样本:%d, alpha优化次数:%d" % (iter,i,alphaPairsChanged))
-            iter += 1
+        if entireSet:                                                               #遍历整个数据集
+            alphaPairsChanged += sum(innerL(i, oS) for i in range(oS.m))
         else:                                                                       #遍历非边界值
             nonBoundIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]       #遍历不在边界0和C的alpha
-            for i in nonBoundIs:
-                alphaPairsChanged += innerL(i,oS)
-                print("非边界遍历:第%d次迭代 样本:%d, alpha优化次数:%d" % (iter,i,alphaPairsChanged))
-            iter += 1
+            alphaPairsChanged += sum(innerL(i, oS) for i in nonBoundIs)
+        iter += 1
         if entireSet:                                                               #遍历一次后改为非边界遍历
             entireSet = False
-        elif (alphaPairsChanged == 0):                                              #如果alpha没有更新,计算全样本遍历
+        elif alphaPairsChanged == 0:                                                #如果alpha没有更新,计算全样本遍历
             entireSet = True
         print("迭代次数: %d" % iter)
-    return oS.b,oS.alphas                                                           #返回SMO算法计算的b和alphas
+    return oS.b, oS.alphas                                                          #返回SMO算法计算的b和alphas
 
 
 def showClassifer(dataMat, classLabels, w, b):
@@ -246,10 +253,8 @@ def showClassifer(dataMat, classLabels, w, b):
             data_plus.append(dataMat[i])
         else:
             data_minus.append(dataMat[i])
-    data_plus_np = np.array(data_plus)                                                           #转换为numpy矩阵
-    data_minus_np = np.array(data_minus)                                                         #转换为numpy矩阵
-    plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1], s=30, alpha=0.7)   #正样本散点图
-    plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1], s=30, alpha=0.7) #负样本散点图
+    plt.scatter(np.transpose(data_plus)[0], np.transpose(data_plus)[1], s=30, alpha=0.7)         #正样本散点图
+    plt.scatter(np.transpose(data_minus)[0], np.transpose(data_minus)[1], s=30, alpha=0.7)       #负样本散点图
     #绘制直线
     x1 = max(dataMat)[0]
     x2 = min(dataMat)[0]
@@ -257,7 +262,7 @@ def showClassifer(dataMat, classLabels, w, b):
     b = float(b)
     a1 = float(a1[0])
     a2 = float(a2[0])
-    y1, y2 = (-b- a1*x1)/a2, (-b - a1*x2)/a2
+    y1, y2 = (-b - a1 * x1) / a2, (-b - a1 * x2) / a2
     plt.plot([x1, x2], [y1, y2])
     #找出支持向量点
     for i, alpha in enumerate(alphas):
@@ -267,7 +272,7 @@ def showClassifer(dataMat, classLabels, w, b):
     plt.show()
 
 
-def calcWs(alphas,dataArr,classLabels):
+def calcWs(alphas, dataArr, classLabels):
     """
     计算w
     Parameters:
@@ -277,11 +282,12 @@ def calcWs(alphas,dataArr,classLabels):
     Returns:
         w - 计算得到的w
     """
-    X = np.mat(dataArr); labelMat = np.mat(classLabels).transpose()
+    X = np.mat(dataArr)
+    labelMat = np.mat(classLabels).T
     m,n = np.shape(X)
     w = np.zeros((n,1))
     for i in range(m):
-        w += np.multiply(alphas[i]*labelMat[i],X[i,:].T)
+        w += np.multiply(alphas[i] * labelMat[i], X[i].T)
     return w
 
 if __name__ == '__main__':
diff --git a/SVM/svm-svc.py b/SVM/svm-svc.py
index 2db1673..8a75207 100644
--- a/SVM/svm-svc.py
+++ b/SVM/svm-svc.py
@@ -33,7 +33,7 @@ def img2vector(filename):
         lineStr = fr.readline()
         #每一行的前32个元素依次添加到returnVect中
         for j in range(32):
-            returnVect[0, 32*i+j] = int(lineStr[j])
+            returnVect[0, 32 * i + j] = int(lineStr[j])
     #返回转换后的1x1024向量
     return returnVect
 
@@ -62,13 +62,13 @@ def handwritingClassTest():
         #将获得的类别添加到hwLabels中
         hwLabels.append(classNumber)
         #将每一个文件的1x1024数据存储到trainingMat矩阵中
-        trainingMat[i,:] = img2vector('trainingDigits/%s' % (fileNameStr))
-    clf = SVC(C=200,kernel='rbf')
+        trainingMat[i] = img2vector('trainingDigits/%s' % (fileNameStr))
+    clf = SVC(C=200, kernel='rbf')
     clf.fit(trainingMat,hwLabels)
     #返回testDigits目录下的文件列表
     testFileList = listdir('testDigits')
     #错误检测计数
-    errorCount = 0.0
+    errorCount = 0
     #测试数据的数量
     mTest = len(testFileList)
     #从文件中解析出测试集的类别并进行分类测试
@@ -83,8 +83,7 @@ def handwritingClassTest():
         # classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
         classifierResult = clf.predict(vectorUnderTest)
         print("分类返回结果为%d\t真实结果为%d" % (classifierResult, classNumber))
-        if(classifierResult != classNumber):
-            errorCount += 1.0
+        errorCount += classifierResult != classNumber
     print("总共错了%d个数据\n错误率为%f%%" % (errorCount, errorCount/mTest * 100))
 
 if __name__ == '__main__':
diff --git a/SVM/svmMLiA.py b/SVM/svmMLiA.py
index 82cba1d..38c5d5a 100644
--- a/SVM/svmMLiA.py
+++ b/SVM/svmMLiA.py
@@ -30,12 +30,12 @@ def __init__(self, dataMatIn, classLabels, C, toler, kTup):
         self.C = C                                          #松弛变量
         self.tol = toler                                    #容错率
         self.m = np.shape(dataMatIn)[0]                     #数据矩阵行数
-        self.alphas = np.mat(np.zeros((self.m,1)))          #根据矩阵行数初始化alpha参数为0
+        self.alphas = np.mat(np.zeros((self.m, 1)))         #根据矩阵行数初始化alpha参数为0
         self.b = 0                                          #初始化b参数为0
-        self.eCache = np.mat(np.zeros((self.m,2)))          #根据矩阵行数初始化虎误差缓存,第一列为是否有效的标志位,第二列为实际的误差E的值。
-        self.K = np.mat(np.zeros((self.m,self.m)))          #初始化核K
+        self.eCache = np.mat(np.zeros((self.m, 2)))         #根据矩阵行数初始化误差缓存,第一列为是否有效的标志位,第二列为实际的误差E的值。
+        self.K = np.mat(np.zeros((self.m, self.m)))         #初始化核K
         for i in range(self.m):                             #计算所有数据的核K
-            self.K[:,i] = kernelTrans(self.X, self.X[i,:], kTup)
+            self.K[:,i] = kernelTrans(self.X, self.X[i], kTup)
 
 def kernelTrans(X, A, kTup):
     """
@@ -48,14 +48,16 @@ def kernelTrans(X, A, kTup):
         K - 计算的核K
     """
     m,n = np.shape(X)
-    K = np.mat(np.zeros((m,1)))
-    if kTup[0] == 'lin': K = X * A.T                        #线性核函数,只进行内积。
+    K = np.mat(np.zeros((m, 1)))
+    if kTup[0] == 'lin':
+        K = X * A.T                                         #线性核函数,只进行内积。
     elif kTup[0] == 'rbf':                                  #高斯核函数,根据高斯核函数公式进行计算
         for j in range(m):
-            deltaRow = X[j,:] - A
-            K[j] = deltaRow*deltaRow.T
-        K = np.exp(K/(-1*kTup[1]**2))                       #计算高斯核K
-    else: raise NameError('核函数无法识别')
+            deltaRow = X[j] - A
+            K[j] = deltaRow * deltaRow.T
+        K = np.exp(- K / kTup[1]**2)                        #计算高斯核K
+    else:
+        raise NameError('核函数无法识别')
     return K                                                #返回计算的核K
 
 def loadDataSet(fileName):
     """
@@ -67,7 +69,8 @@ def loadDataSet(fileName):
         dataMat - 数据矩阵
         labelMat - 数据标签
     """
-    dataMat = []; labelMat = []
+    dataMat = []
+    labelMat = []
     fr = open(fileName)
     for line in fr.readlines():                                     #逐行读取,滤除空格等
         lineArr = line.strip().split('\t')
@@ -84,7 +87,7 @@ def calcEk(oS, k):
     Returns:
         Ek - 标号为k的数据误差
     """
-    fXk = float(np.multiply(oS.alphas,oS.labelMat).T*oS.K[:,k] + oS.b)
+    fXk = np.multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b
     Ek = fXk - float(oS.labelMat[k])
     return Ek
 
@@ -99,8 +102,8 @@ def selectJrand(i, m):
         j - alpha_j的索引值
     """
     j = i                                 #选择一个不等于i的j
-    while (j == i):
-        j = int(random.uniform(0, m))
+    while j == i:
+        j = random.randint(0, m - 1)
     return j
 
 def selectJ(i, oS, Ei):
@@ -114,16 +117,21 @@ def selectJ(i, oS, Ei):
         j, maxK - 标号为j或maxK的数据的索引值
         Ej - 标号为j的数据误差
     """
-    maxK = -1; maxDeltaE = 0; Ej = 0                         #初始化
-    oS.eCache[i] = [1,Ei]                                    #根据Ei更新误差缓存
-    validEcacheList = np.nonzero(oS.eCache[:,0].A)[0]        #返回误差不为0的数据的索引值
+    maxK = -1
+    maxDeltaE = 0
+    Ej = 0                                                   #初始化
+    oS.eCache[i] = [1, Ei]                                   #根据Ei更新误差缓存
+    validEcacheList = np.nonzero(oS.eCache[:, 0].A)[0]       #返回误差不为0的数据的索引值
     if (len(validEcacheList)) > 1:                           #有不为0的误差
         for k in validEcacheList:                            #遍历,找到最大的Ek
-            if k == i: continue                              #不计算i,浪费时间
+            if k == i:
+                continue                                     #不计算i,浪费时间
             Ek = calcEk(oS, k)                               #计算Ek
             deltaE = abs(Ei - Ek)                            #计算|Ei-Ek|
-            if (deltaE > maxDeltaE):                         #找到maxDeltaE
-                maxK = k; maxDeltaE = deltaE; Ej = Ek
+            if deltaE > maxDeltaE:                           #找到maxDeltaE
+                maxK = k
+                maxDeltaE = deltaE
+                Ej = Ek
         return maxK, Ej                                      #返回maxK,Ej
     else:                                                    #没有不为0的误差
         j = selectJrand(i, oS.m)                             #随机选择alpha_j的索引值
@@ -140,7 +148,7 @@ def updateEk(oS, k):
         无
     """
     Ek = calcEk(oS, k)                                       #计算Ek
-    oS.eCache[k] = [1,Ek]                                    #更新误差缓存
+    oS.eCache[k] = [1, Ek]                                   #更新误差缓存
 
 
 def clipAlpha(aj,H,L):
@@ -153,10 +161,8 @@ def clipAlpha(aj,H,L):
     Returns:
         aj - 修剪后的alpah_j的值
     """
-    if aj > H:
-        aj = H
-    if L > aj:
-        aj = L
+    aj = min(aj, H)
+    aj = max(L, aj)
     return aj
 
 def innerL(i, oS):
@@ -174,11 +180,12 @@ def innerL(i, oS):
     #优化alpha,设定一定的容错率。
     if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
         #使用内循环启发方式2选择alpha_j,并计算Ej
-        j,Ej = selectJ(i, oS, Ei)
+        j, Ej = selectJ(i, oS, Ei)
         #保存更新前的aplpha值,使用深拷贝
-        alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy();
+        alphaIold = oS.alphas[i].copy()
+        alphaJold = oS.alphas[j].copy()
         #步骤2:计算上下界L和H
-        if (oS.labelMat[i] != oS.labelMat[j]):
+        if oS.labelMat[i] != oS.labelMat[j]:
             L = max(0, oS.alphas[j] - oS.alphas[i])
             H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
         else:
@@ -188,35 +195,40 @@ def innerL(i, oS):
             print("L==H")
             return 0
         #步骤3:计算eta,核函数版本必须使用核矩阵K
-        eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j]
+        eta = 2 * oS.K[i, j] - oS.K[i, i] - oS.K[j, j]
         if eta >= 0:
             print("eta>=0")
             return 0
         #步骤4:更新alpha_j
-        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej)/eta
+        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
        #步骤5:修剪alpha_j
-        oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
+        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
         #更新Ej至误差缓存
         updateEk(oS, j)
-        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
+        if abs(oS.alphas[j] - alphaJold) < 0.00001:
             print("alpha_j变化太小")
             return 0
         #步骤6:更新alpha_i
-        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
+        oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j])
         #更新Ei至误差缓存
         updateEk(oS, i)
         #步骤7:更新b_1和b_2
-        b1 = oS.b - Ei- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j]
-        b2 = oS.b - Ej- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j]- oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
+        b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, i] \
+             - oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.K[i, j]
+        b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, j] \
+             - oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.K[j, j]
         #步骤8:根据b_1和b_2更新b
-        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1
-        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2
-        else: oS.b = (b1 + b2)/2.0
+        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
+            oS.b = b1
+        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
+            oS.b = b2
+        else:
+            oS.b = (b1 + b2) / 2
         return 1
     else:
         return 0
 
-def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup = ('lin',0)):
+def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
     """
     完整的线性SMO算法
     Parameters:
@@ -230,31 +242,27 @@ def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup = ('lin',0)):
         oS.b - SMO算法计算的b
         oS.alphas - SMO算法计算的alphas
     """
-    oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).transpose(), C, toler, kTup)    #初始化数据结构
+    oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).T, C, toler, kTup)              #初始化数据结构
     iter = 0                                                                              #初始化当前迭代次数
-    entireSet = True; alphaPairsChanged = 0
+    entireSet = True
+    alphaPairsChanged = 0
     while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):                  #遍历整个数据集都alpha也没有更新或者超过最大迭代次数,则退出循环
         alphaPairsChanged = 0
         if entireSet:                                                                     #遍历整个数据集
-            for i in range(oS.m):
-                alphaPairsChanged += innerL(i,oS)                                         #使用优化的SMO算法
-                print("全样本遍历:第%d次迭代 样本:%d, alpha优化次数:%d" % (iter,i,alphaPairsChanged))
-            iter += 1
+            alphaPairsChanged += sum(innerL(i, oS) for i in range(oS.m))
         else:                                                                             #遍历非边界值
             nonBoundIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]             #遍历不在边界0和C的alpha
-            for i in nonBoundIs:
-                alphaPairsChanged += innerL(i,oS)
-                print("非边界遍历:第%d次迭代 样本:%d, alpha优化次数:%d" % (iter,i,alphaPairsChanged))
-            iter += 1
+            alphaPairsChanged += sum(innerL(i, oS) for i in nonBoundIs)
+        iter += 1
         if entireSet:                                                                     #遍历一次后改为非边界遍历
             entireSet = False
-        elif (alphaPairsChanged == 0):                                                    #如果alpha没有更新,计算全样本遍历
+        elif alphaPairsChanged == 0:                                                      #如果alpha没有更新,计算全样本遍历
             entireSet = True
         print("迭代次数: %d" % iter)
-    return oS.b,oS.alphas                                                                 #返回SMO算法计算的b和alphas
+    return oS.b, oS.alphas                                                                #返回SMO算法计算的b和alphas
 
-def testRbf(k1 = 1.3):
+def testRbf(k1=1.3):
     """
     测试函数
     Parameters:
@@ -264,26 +272,26 @@
     """
     dataArr,labelArr = loadDataSet('testSetRBF.txt')                        #加载训练集
     b,alphas = smoP(dataArr, labelArr, 200, 0.0001, 100, ('rbf', k1))       #根据训练集计算b和alphas
-    datMat = np.mat(dataArr); labelMat = np.mat(labelArr).transpose()
+    datMat = np.mat(dataArr); labelMat = np.mat(labelArr).T
     svInd = np.nonzero(alphas.A > 0)[0]                                     #获得支持向量
     sVs = datMat[svInd]
-    labelSV = labelMat[svInd];
+    labelSV = labelMat[svInd]
     print("支持向量个数:%d" % np.shape(sVs)[0])
-    m,n = np.shape(datMat)
+    m, n = np.shape(datMat)
     errorCount = 0
     for i in range(m):
-        kernelEval = kernelTrans(sVs,datMat[i,:],('rbf', k1))               #计算各个点的核
+        kernelEval = kernelTrans(sVs, datMat[i], ('rbf', k1))               #计算各个点的核
         predict = kernelEval.T * np.multiply(labelSV,alphas[svInd]) + b     #根据支持向量的点,计算超平面,返回预测结果
-        if np.sign(predict) != np.sign(labelArr[i]): errorCount += 1       #返回数组中各元素的正负符号,用1和-1表示,并统计错误个数
-    print("训练集错误率: %.2f%%" % ((float(errorCount)/m)*100))            #打印错误率
-    dataArr,labelArr = loadDataSet('testSetRBF2.txt')                       #加载测试集
+        errorCount += np.sign(predict) != np.sign(labelArr[i])              #返回数组中各元素的正负符号,用1和-1表示,并统计错误个数
+    print("训练集错误率: %.2f%%" % (errorCount / m * 100))                  #打印错误率
+    dataArr, labelArr = loadDataSet('testSetRBF2.txt')                      #加载测试集
     errorCount = 0
-    datMat = np.mat(dataArr); labelMat = np.mat(labelArr).transpose()
+    datMat = np.mat(dataArr)
     m,n = np.shape(datMat)
     for i in range(m):
-        kernelEval = kernelTrans(sVs,datMat[i,:],('rbf', k1))               #计算各个点的核
-        predict=kernelEval.T * np.multiply(labelSV,alphas[svInd]) + b       #根据支持向量的点,计算超平面,返回预测结果
-        if np.sign(predict) != np.sign(labelArr[i]): errorCount += 1       #返回数组中各元素的正负符号,用1和-1表示,并统计错误个数
+        kernelEval = kernelTrans(sVs, datMat[i], ('rbf', k1))               #计算各个点的核
+        predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b    #根据支持向量的点,计算超平面,返回预测结果
+        errorCount += np.sign(predict) != np.sign(labelArr[i])              #返回数组中各元素的正负符号,用1和-1表示,并统计错误个数
     print("测试集错误率: %.2f%%" % ((float(errorCount)/m)*100))            #打印错误率
 
 
@@ -303,10 +311,8 @@ def showDataSet(dataMat, labelMat):
             data_plus.append(dataMat[i])
         else:
             data_minus.append(dataMat[i])
-    data_plus_np = np.array(data_plus)                                            #转换为numpy矩阵
-    data_minus_np = np.array(data_minus)                                          #转换为numpy矩阵
-    plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1])     #正样本散点图
-    plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1])   #负样本散点图
+    plt.scatter(np.transpose(data_plus)[0], np.transpose(data_plus)[1])           #正样本散点图
+    plt.scatter(np.transpose(data_minus)[0], np.transpose(data_minus)[1])         #负样本散点图
     plt.show()
 
 if __name__ == '__main__':
diff --git "a/kNN/1.\347\256\200\345\215\225k-NN/kNN_test01.py" "b/kNN/1.\347\256\200\345\215\225k-NN/kNN_test01.py"
index 0b8f32e..9d048c7 100644
--- "a/kNN/1.\347\256\200\345\215\225k-NN/kNN_test01.py"
+++ "b/kNN/1.\347\256\200\345\215\225k-NN/kNN_test01.py"
@@ -33,7 +33,7 @@ def createDataSet():
         sortedClassCount[0][0] - 分类结果
 
     Modify:
-        2017-11-09 by Cugtyt
+        2017-11-14 by Cugtyt
             * GitHub(https://github.com/Cugtyt)
             * Email(cugtyt@qq.com)
             Use list comprehension and Counter to simplify code
diff --git "a/kNN/2.\346\265\267\344\274\246\347\272\246\344\274\232/kNN_test02.py" "b/kNN/2.\346\265\267\344\274\246\347\272\246\344\274\232/kNN_test02.py"
index d156159..91ef4f5 100644
--- "a/kNN/2.\346\265\267\344\274\246\347\272\246\344\274\232/kNN_test02.py"
+++ "b/kNN/2.\346\265\267\344\274\246\347\272\246\344\274\232/kNN_test02.py"
@@ -4,7 +4,7 @@
 import matplotlib.lines as mlines
 import matplotlib.pyplot as plt
 import numpy as np
-import operator
+import collections
 
 
 """
@@ -19,37 +19,21 @@
         sortedClassCount[0][0] - 分类结果
 
     Modify:
+        2017-11-14 by Cugtyt
+            * GitHub(https://github.com/Cugtyt)
+            * Email(cugtyt@qq.com)
+            Use list comprehension, Counter, broadcasting instead of
+            tile in numpy to simplify code.
        2017-03-24
 """
 def classify0(inX, dataSet, labels, k):
-    #numpy函数shape[0]返回dataSet的行数
-    dataSetSize = dataSet.shape[0]
-    #在列向量方向上重复inX共1次(横向),行向量方向上重复inX共dataSetSize次(纵向)
-    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
-    #二维特征相减后平方
-    sqDiffMat = diffMat**2
-    #sum()所有元素相加,sum(0)列相加,sum(1)行相加
-    sqDistances = sqDiffMat.sum(axis=1)
-    #开方,计算出距离
-    distances = sqDistances**0.5
-    #返回distances中元素从小到大排序后的索引值
-    sortedDistIndices = distances.argsort()
-    #定一个记录类别次数的字典
-    classCount = {}
-    for i in range(k):
-        #取出前k个元素的类别
-        voteIlabel = labels[sortedDistIndices[i]]
-        #dict.get(key,default=None),字典的get()方法,返回指定键的值,如果值不在字典中返回默认值。
-        #计算类别次数
-        classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1
-    #python3中用items()替换python2中的iteritems()
-    #key=operator.itemgetter(1)根据字典的值进行排序
-    #key=operator.itemgetter(0)根据字典的键进行排序
-    #reverse降序排序字典
-    sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
-    print(sortedClassCount)
-    #返回次数最多的类别,即所要分类的类别
-    return sortedClassCount[0][0]
+    # 计算距离
+    dist = np.sum((inX - dataSet) ** 2, axis=1) ** 0.5
+    # k个最近的标签
+    k_labels = [labels[index] for index in dist.argsort()[0: k]]
+    # 出现次数最多的标签即为最终类别
+    label = collections.Counter(k_labels).most_common(1)[0][0]
+    return label
 
 
 """
@@ -62,6 +46,12 @@ def classify0(inX, dataSet, labels, k):
         classLabelVector - 分类Label向量
 
     Modify:
+        2017-11-14 by Cugtyt
+            * GitHub(https://github.com/Cugtyt)
+            * Email(cugtyt@qq.com)
+            Remove variable not used.
+            Use dict to simplify if conditions.
+            Use enumerate to get index in each iteration.
         2017-03-24
 """
 def file2matrix(filename):
@@ -70,28 +60,20 @@ def file2matrix(filename):
     #读取文件所有内容
     arrayOLines = fr.readlines()
     #得到文件行数
-    numberOfLines = len(arrayOLines)
     #返回的NumPy矩阵,解析完成的数据:numberOfLines行,3列
-    returnMat = np.zeros((numberOfLines,3))
+    returnMat = np.zeros((len(arrayOLines), 3))
     #返回的分类标签向量
     classLabelVector = []
-    #行的索引值
-    index = 0
-    for line in arrayOLines:
+    labeldict = {'didntLike' : 1, 'smallDoses' : 2, 'largeDoses' : 3}
+    for index, line in enumerate(arrayOLines):
         #s.strip(rm),当rm空时,默认删除空白符(包括'\n','\r','\t',' ')
         line = line.strip()
         #使用s.split(str="",num=string,cout(str))将字符串根据'\t'分隔符进行切片。
         listFromLine = line.split('\t')
         #将数据前三列提取出来,存放到returnMat的NumPy矩阵中,也就是特征矩阵
-        returnMat[index,:] = listFromLine[0:3]
+        returnMat[index] = listFromLine[0 : 3]
         #根据文本中标记的喜欢的程度进行分类,1代表不喜欢,2代表魅力一般,3代表极具魅力
-        if listFromLine[-1] == 'didntLike':
-            classLabelVector.append(1)
-        elif listFromLine[-1] == 'smallDoses':
-            classLabelVector.append(2)
-        elif listFromLine[-1] == 'largeDoses':
-            classLabelVector.append(3)
-        index += 1
+        classLabelVector.append(labeldict[listFromLine[-1]])
     return returnMat, classLabelVector
 
 """
@@ -103,6 +85,10 @@ def file2matrix(filename):
     Returns:
         无
     Modify:
+        2017-11-14 by Cugtyt
+            * GitHub(https://github.com/Cugtyt)
+            * Email(cugtyt@qq.com)
+            Use dict to simplify if conditions.
         2017-03-24
 """
 def showdatas(datingDataMat, datingLabels):
@@ -110,23 +96,17 @@ def showdatas(datingDataMat, datingLabels):
     font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)
     #将fig画布分隔成1行1列,不共享x轴和y轴,fig画布的大小为(13,8)
     #当nrow=2,nclos=2时,代表fig画布被分为四个区域,axs[0][0]表示第一行第一个区域
-    fig, axs = plt.subplots(nrows=2, ncols=2,sharex=False, sharey=False, figsize=(13,8))
+    fig, axs = plt.subplots(nrows=2, ncols=2,sharex=False, sharey=False, figsize=(13, 8))
+
+    colordict = {1 : 'black', 2 : 'orange', 3 : 'red'}
+    LabelsColors = [colordict[i] for i in datingLabels]
 
-    numberOfLabels = len(datingLabels)
-    LabelsColors = []
-    for i in datingLabels:
-        if i == 1:
-            LabelsColors.append('black')
-        if i == 2:
-            LabelsColors.append('orange')
-        if i == 3:
-            LabelsColors.append('red')
     #画出散点图,以datingDataMat矩阵的第一(飞行常客例程)、第二列(玩游戏)数据画散点数据,散点大小为15,透明度为0.5
-    axs[0][0].scatter(x=datingDataMat[:,0], y=datingDataMat[:,1], color=LabelsColors,s=15, alpha=.5)
+    axs[0][0].scatter(x=datingDataMat[:, 0], y=datingDataMat[:, 1], color=LabelsColors, s=15, alpha=.5)
     #设置标题,x轴label,y轴label
-    axs0_title_text = axs[0][0].set_title(u'每年获得的飞行常客里程数与玩视频游戏所消耗时间占比',FontProperties=font)
-    axs0_xlabel_text = axs[0][0].set_xlabel(u'每年获得的飞行常客里程数',FontProperties=font)
-    axs0_ylabel_text = axs[0][0].set_ylabel(u'玩视频游戏所消耗时间占比',FontProperties=font)
+    axs0_title_text = axs[0][0].set_title(u'每年获得的飞行常客里程数与玩视频游戏所消耗时间占比', FontProperties=font)
+    axs0_xlabel_text = axs[0][0].set_xlabel(u'每年获得的飞行常客里程数', FontProperties=font)
+    axs0_ylabel_text = axs[0][0].set_ylabel(u'玩视频游戏所消耗时间占比', FontProperties=font)
     plt.setp(axs0_title_text, size=9, weight='bold', color='red')
     plt.setp(axs0_xlabel_text, size=7, weight='bold', color='black')
     plt.setp(axs0_ylabel_text, size=7, weight='bold', color='black')
@@ -134,19 +114,19 @@ def showdatas(datingDataMat, datingLabels):
     #画出散点图,以datingDataMat矩阵的第一(飞行常客例程)、第三列(冰激凌)数据画散点数据,散点大小为15,透明度为0.5
     axs[0][1].scatter(x=datingDataMat[:,0], y=datingDataMat[:,2], color=LabelsColors,s=15, alpha=.5)
     #设置标题,x轴label,y轴label
-    axs1_title_text = axs[0][1].set_title(u'每年获得的飞行常客里程数与每周消费的冰激淋公升数',FontProperties=font)
-    axs1_xlabel_text = axs[0][1].set_xlabel(u'每年获得的飞行常客里程数',FontProperties=font)
-    axs1_ylabel_text = axs[0][1].set_ylabel(u'每周消费的冰激淋公升数',FontProperties=font)
+    axs1_title_text = axs[0][1].set_title(u'每年获得的飞行常客里程数与每周消费的冰激淋公升数', FontProperties=font)
+    axs1_xlabel_text = axs[0][1].set_xlabel(u'每年获得的飞行常客里程数', FontProperties=font)
+    axs1_ylabel_text = axs[0][1].set_ylabel(u'每周消费的冰激淋公升数', FontProperties=font)
     plt.setp(axs1_title_text, size=9, weight='bold', color='red')
     plt.setp(axs1_xlabel_text, size=7, weight='bold', color='black')
     plt.setp(axs1_ylabel_text, size=7, weight='bold', color='black')
     #画出散点图,以datingDataMat矩阵的第二(玩游戏)、第三列(冰激凌)数据画散点数据,散点大小为15,透明度为0.5
-    axs[1][0].scatter(x=datingDataMat[:,1], y=datingDataMat[:,2], color=LabelsColors,s=15, alpha=.5)
+    axs[1][0].scatter(x=datingDataMat[:,1], y=datingDataMat[:,2], color=LabelsColors, s=15, alpha=.5)
     #设置标题,x轴label,y轴label
-    axs2_title_text = axs[1][0].set_title(u'玩视频游戏所消耗时间占比与每周消费的冰激淋公升数',FontProperties=font)
-    axs2_xlabel_text = axs[1][0].set_xlabel(u'玩视频游戏所消耗时间占比',FontProperties=font)
-    axs2_ylabel_text = axs[1][0].set_ylabel(u'每周消费的冰激淋公升数',FontProperties=font)
+    axs2_title_text = axs[1][0].set_title(u'玩视频游戏所消耗时间占比与每周消费的冰激淋公升数', FontProperties=font)
+    axs2_xlabel_text = axs[1][0].set_xlabel(u'玩视频游戏所消耗时间占比', FontProperties=font)
+    axs2_ylabel_text = axs[1][0].set_ylabel(u'每周消费的冰激淋公升数', FontProperties=font)
     plt.setp(axs2_title_text, size=9, weight='bold', color='red')
     plt.setp(axs2_xlabel_text, size=7, weight='bold', color='black')
     plt.setp(axs2_ylabel_text, size=7, weight='bold', color='black')
@@ -158,9 +138,9 @@ def showdatas(datingDataMat, datingLabels):
     largeDoses = mlines.Line2D([], [], color='red', marker='.', markersize=6, label='largeDoses')
     #添加图例
-    axs[0][0].legend(handles=[didntLike,smallDoses,largeDoses])
-    axs[0][1].legend(handles=[didntLike,smallDoses,largeDoses])
-    axs[1][0].legend(handles=[didntLike,smallDoses,largeDoses])
+    axs[0][0].legend(handles=[didntLike, smallDoses, largeDoses])
+    axs[0][1].legend(handles=[didntLike, smallDoses, largeDoses])
+    axs[1][0].legend(handles=[didntLike, smallDoses, largeDoses])
     #显示图片
     plt.show()
 
 """
@@ -176,6 +156,11 @@ def showdatas(datingDataMat, datingLabels):
         minVals - 数据最小值
 
     Modify:
+        2017-11-14 by Cugtyt
+            * GitHub(https://github.com/Cugtyt)
+            * Email(cugtyt@qq.com)
+            Use broadcasting instead of tile for heavy computation cost.
+            Make the code more readable.
         2017-03-24
 """
 def autoNorm(dataSet):
@@ -184,14 +169,7 @@ def autoNorm(dataSet):
     maxVals = dataSet.max(0)
     #最大值和最小值的范围
     ranges = maxVals - minVals
-    #shape(dataSet)返回dataSet的矩阵行列数
-    normDataSet = np.zeros(np.shape(dataSet))
-    #返回dataSet的行数
-    m = dataSet.shape[0]
-    #原始值减去最小值
-    normDataSet = dataSet - np.tile(minVals, (m, 1))
-    #除以最大和最小值的差,得到归一化数据
-    normDataSet = normDataSet / np.tile(ranges, (m, 1))
+    normDataSet = (dataSet - minVals) / ranges
     #返回归一化数据结果,数据范围,最小值
     return normDataSet, ranges, minVals
 
 """
@@ -207,6 +185,10 @@ def autoNorm(dataSet):
         minVals - 数据最小值
 
     Modify:
+        2017-11-14 by Cugtyt
+            * GitHub(https://github.com/Cugtyt)
+            * Email(cugtyt@qq.com)
+            Simplify if condition.
         2017-03-24
 """
 def datingClassTest():
@@ -227,12 +209,11 @@ def datingClassTest():
 
     for i in range(numTestVecs):
         #前numTestVecs个数据作为测试集,后m-numTestVecs个数据作为训练集
-        classifierResult = classify0(normMat[i,:], normMat[numTestVecs:m,:],
-            datingLabels[numTestVecs:m], 4)
+        classifierResult = classify0(normMat[i], normMat[numTestVecs : m],
+            datingLabels[numTestVecs : m], 4)
         print("分类结果:%s\t真实类别:%d" % (classifierResult, datingLabels[i]))
-        if classifierResult != datingLabels[i]:
-            errorCount += 1.0
-    print("错误率:%f%%" %(errorCount/float(numTestVecs)*100))
+        errorCount += classifierResult != datingLabels[i]
+    print("错误率:%f%%" % (errorCount / float(numTestVecs) * 100))
 
 """
 函数说明:通过输入一个人的三维特征,进行分类输出
diff --git "a/kNN/3.\346\225\260\345\255\227\350\257\206\345\210\253/kNN_test03.py" "b/kNN/3.\346\225\260\345\255\227\350\257\206\345\210\253/kNN_test03.py"
index e902491..571641b 100644
--- "a/kNN/3.\346\225\260\345\255\227\350\257\206\345\210\253/kNN_test03.py"
+++ "b/kNN/3.\346\225\260\345\255\227\350\257\206\345\210\253/kNN_test03.py"
@@ -1,6 +1,6 @@
 # -*- coding: UTF-8 -*-
 import numpy as np
-import operator
+import collections
 from os import listdir
 
 """
@@ -15,36 +15,21 @@
         sortedClassCount[0][0] - 分类结果
 
     Modify:
+        2017-11-14 by Cugtyt
+            * GitHub(https://github.com/Cugtyt)
+            * Email(cugtyt@qq.com)
+            Use list comprehension, Counter, broadcasting instead of
+            tile in numpy to simplify code.
         2017-03-25
 """
 def classify0(inX, dataSet, labels, k):
-    #numpy函数shape[0]返回dataSet的行数
-    dataSetSize = dataSet.shape[0]
-    #在列向量方向上重复inX共1次(横向),行向量方向上重复inX共dataSetSize次(纵向)
-    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
-    #二维特征相减后平方
-    sqDiffMat = diffMat**2
-    #sum()所有元素相加,sum(0)列相加,sum(1)行相加
-    sqDistances = sqDiffMat.sum(axis=1)
-    #开方,计算出距离
-    distances = sqDistances**0.5
-    #返回distances中元素从小到大排序后的索引值
-    sortedDistIndices = distances.argsort()
-    #定一个记录类别次数的字典
-    classCount = {}
-    for i in range(k):
-        #取出前k个元素的类别
-        voteIlabel = labels[sortedDistIndices[i]]
-        #dict.get(key,default=None),字典的get()方法,返回指定键的值,如果值不在字典中返回默认值。
-        #计算类别次数
-        classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1
-    #python3中用items()替换python2中的iteritems()
-    #key=operator.itemgetter(1)根据字典的值进行排序
-    #key=operator.itemgetter(0)根据字典的键进行排序
-    #reverse降序排序字典
-    sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
-    #返回次数最多的类别,即所要分类的类别
-    return sortedClassCount[0][0]
+    # 计算距离
+    dist = np.sum((inX - dataSet) ** 2, axis=1) ** 0.5
+    # k个最近的标签
+    k_labels = [labels[index] for index in dist.argsort()[0: k]]
+    # 出现次数最多的标签即为最终类别
+    label = collections.Counter(k_labels).most_common(1)[0][0]
+    return label
 
 """
 函数说明:将32x32的二进制图像转换为1x1024向量。
@@ -68,7 +53,7 @@ def img2vector(filename):
         lineStr = fr.readline()
         #每一行的前32个元素依次添加到returnVect中
         for j in range(32):
-            returnVect[0, 32*i+j] = int(lineStr[j])
+            returnVect[0, 32 * i + j] = int(lineStr[j])
     #返回转换后的1x1024向量
     return returnVect
 
 """
@@ -81,6 +66,11 @@ def img2vector(filename):
         无
 
     Modify:
+        2017-11-14 by Cugtyt
+            * GitHub(https://github.com/Cugtyt)
+            * Email(cugtyt@qq.com)
+            Simplify if condition.
+            Remove float(), not needed in python3
         2017-03-25
 """
 def handwritingClassTest():
@@ -101,11 +91,11 @@ def handwritingClassTest():
         #将获得的类别添加到hwLabels中
         hwLabels.append(classNumber)
         #将每一个文件的1x1024数据存储到trainingMat矩阵中
-        trainingMat[i,:] = img2vector('trainingDigits/%s' % (fileNameStr))
+        trainingMat[i] = img2vector('trainingDigits/%s' % (fileNameStr))
     #返回testDigits目录下的文件名
     testFileList = listdir('testDigits')
     #错误检测计数
-    errorCount = 0.0
+    errorCount = 0
     #测试数据的数量
     mTest = len(testFileList)
     #从文件中解析出测试集的类别并进行分类测试
@@ -119,8 +109,7 @@ def handwritingClassTest():
         #获得预测结果
         classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
         print("分类返回结果为%d\t真实结果为%d" % (classifierResult, classNumber))
-        if(classifierResult != classNumber):
-            errorCount += 1.0
+        errorCount += classifierResult != classNumber
     print("总共错了%d个数据\n错误率为%f%%" % (errorCount, errorCount/mTest))
diff --git "a/kNN/3.\346\225\260\345\255\227\350\257\206\345\210\253/kNN_test04.py" "b/kNN/3.\346\225\260\345\255\227\350\257\206\345\210\253/kNN_test04.py"
index 7bf2cdb..faadc4a 100644
--- "a/kNN/3.\346\225\260\345\255\227\350\257\206\345\210\253/kNN_test04.py"
+++ "b/kNN/3.\346\225\260\345\255\227\350\257\206\345\210\253/kNN_test04.py"
@@ -39,6 +39,11 @@ def img2vector(filename):
         无
 
     Modify:
+        2017-11-14 by Cugtyt
+            * GitHub(https://github.com/Cugtyt)
+            * Email(cugtyt@qq.com)
+            Simplify if condition.
+            Remove float(), not needed in python3
         2017-07-15
 """
 def handwritingClassTest():
@@ -59,7 +64,7 @@ def handwritingClassTest():
         #将获得的类别添加到hwLabels中
         hwLabels.append(classNumber)
         #将每一个文件的1x1024数据存储到trainingMat矩阵中
-        trainingMat[i,:] = img2vector('trainingDigits/%s' % (fileNameStr))
+        trainingMat[i] = img2vector('trainingDigits/%s' % (fileNameStr))
     #构建kNN分类器
     neigh = kNN(n_neighbors = 3, algorithm = 'auto')
     #拟合模型, trainingMat为测试矩阵,hwLabels为对应的标签
@@ -67,7 +72,7 @@ def handwritingClassTest():
     #返回testDigits目录下的文件列表
     testFileList = listdir('testDigits')
     #错误检测计数
-    errorCount = 0.0
+    errorCount = 0
     #测试数据的数量
     mTest = len(testFileList)
     #从文件中解析出测试集的类别并进行分类测试
@@ -82,8 +87,7 @@ def handwritingClassTest():
         # classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
         classifierResult = neigh.predict(vectorUnderTest)
         print("分类返回结果为%d\t真实结果为%d" % (classifierResult, classNumber))
-        if(classifierResult != classNumber):
-            errorCount += 1.0
+        errorCount += classifierResult != classNumber
     print("总共错了%d个数据\n错误率为%f%%" % (errorCount, errorCount/mTest * 100))
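Note on the eta rewrite in the SMO hunks: it relies on the identity 2*a*b - a*a - b*b = -(a - b)**2, so the linear-kernel files (svm-simple.py, svm-smo.py) can build eta from the row difference, while the kernelized files (svmMLiA.py and the kernel SMO used for the digits test) must keep eta = 2*K[i,j] - K[i,i] - K[j,j], since the squared-difference form only reproduces the linear kernel. A minimal check of the identity, using hypothetical toy rows in place of dataMatrix[i] and dataMatrix[j]:

    import numpy as np

    xi = np.mat([[1.0, 2.0]])    # stands in for dataMatrix[i]
    xj = np.mat([[3.0, -1.0]])   # stands in for dataMatrix[j]

    eta_expanded = 2.0 * xi * xj.T - xi * xi.T - xj * xj.T   # book form: 2ab - aa - bb
    diff = xi - xj
    eta_compact = -diff * diff.T                             # refactored form: -(a - b)^2
    assert np.allclose(eta_expanded, eta_compact)            # identical for the linear kernel only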
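The clipAlpha rewrite is a plain clamp of alpha_j into [L, H]; the two statements can equally be collapsed into a single expression. A quick sketch of the equivalence (test values made up):

    def clipAlpha(aj, H, L):
        # clamp aj into [L, H]; same behaviour as the min/max pair in the diff
        return max(L, min(aj, H))

    assert clipAlpha(5.0, 4.0, 0.0) == 4.0    # above H -> clipped to H
    assert clipAlpha(-1.0, 4.0, 0.0) == 0.0   # below L -> clipped to L
    assert clipAlpha(2.0, 4.0, 0.0) == 2.0    # inside the interval -> unchanged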
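The get_w change computes w = sum_i(alpha_i * y_i * x_i) in one product: labelMat.reshape(-1, 1) * dataMat broadcasts each label over its sample row (so np.tile is unnecessary), and the final dot with alphas performs the weighted sum. A sketch under hypothetical toy values:

    import numpy as np

    dataMat = np.array([[1.0, 2.0], [3.0, 4.0]])
    labelMat = np.array([1.0, -1.0])
    alphas = np.array([[0.5], [0.25]])

    w = np.dot((labelMat.reshape(-1, 1) * dataMat).T, alphas)   # vectorized form
    # same result as the explicit per-sample sum:
    w_loop = sum(alphas[i] * labelMat[i] * dataMat[i] for i in range(2)).reshape(-1, 1)
    assert np.allclose(w, w_loop)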
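The kNN classify0 rewrite depends on two things: inX - dataSet broadcasts the query row against every training row (replacing np.tile), and collections.Counter replaces the hand-rolled vote dictionary plus operator.itemgetter sort. A self-contained sketch with made-up data:

    import collections
    import numpy as np

    def classify0(inX, dataSet, labels, k):
        # Euclidean distance from inX to every training row via broadcasting
        dist = np.sum((inX - dataSet) ** 2, axis=1) ** 0.5
        # labels of the k nearest neighbours
        k_labels = [labels[index] for index in dist.argsort()[:k]]
        # majority vote decides the class
        return collections.Counter(k_labels).most_common(1)[0][0]

    dataSet = np.array([[1.0, 1.1], [1.0, 1.0], [0.0, 0.0], [0.0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    print(classify0(np.array([0.1, 0.0]), dataSet, labels, 3))   # -> 'B'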
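autoNorm benefits from the same broadcasting rule: (dataSet - minVals) / ranges applies the per-column minima and ranges to every row directly, instead of tiling both to the full matrix shape. A short sketch (numbers hypothetical):

    import numpy as np

    dataSet = np.array([[10.0, 2.0], [20.0, 4.0], [30.0, 8.0]])
    minVals = dataSet.min(0)                      # per-column minimum
    ranges = dataSet.max(0) - minVals             # per-column range
    normDataSet = (dataSet - minVals) / ranges    # broadcasting, no np.tile
    print(normDataSet)                            # every column rescaled into [0, 1]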