# Python3入门机器学习 - k近邻算法

kNN算法的核心思想是：如果一个样本在特征空间中的k个最相邻的样本中的大多数属于某一个类别，则该样本也属于这个类别，并具有该类别样本的特性。

import numpy as np
import matplotlib.pyplot as plt

# Toy training set: ten random 2-D points scaled by 8, each with a random 0/1 label.
x_train = np.random.rand(10, 2) * 8
y_train = np.random.randint(0, 2, 10)

# The query point to classify and the number of neighbours to consult.
x = np.array([3, 4])
k = 3

# Plot label 1 in red first, then label 0 in green, then the query as a blue "+".
for label, colour in ((1, "red"), (0, "green")):
    members = x_train[y_train == label]
    plt.scatter(members[:, 0], members[:, 1], color=colour)
plt.scatter(x[0], x[1], marker='+', color="blue")
plt.show()

from collections import Counter
from math import sqrt

# Work on array copies so the raw inputs from the previous cell stay untouched.
X_train = np.array(x_train)
Y_train = np.array(y_train)

# Euclidean distance from the query point x to every training sample.
# The loop variable must not be named x_train: reusing that name would
# overwrite the training array with its last row and break a re-run of
# this cell.
distances = [sqrt(np.sum((x - sample) ** 2)) for sample in X_train]

# Indices of the training samples ordered from nearest to farthest.
argindex = np.argsort(distances)

# Labels of the k nearest samples, then a majority vote among them.
topK_Y = [Y_train[i] for i in argindex[:k]]
votes = Counter(topK_Y)

# Predicted label: the most frequent label among the k nearest neighbours.
votes.most_common(1)[0][0]

from sklearn.neighbors import KNeighborsClassifier

# Build a 3-neighbour classifier and fit it on the training arrays;
# fit() returns the estimator itself, so the two steps can be chained.
knn_clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)

# Reshape the single query point into a one-row 2-D array, predict, and
# take the first (only) predicted label.
predicted = knn_clf.predict(x.reshape(1, -1))
predicted[0]

# -*- coding: utf-8 -*-
from collections import Counter
from math import sqrt

import numpy as np


class KNNClassifier:
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    The classifier stores the training data in ``fit`` and, for every query
    sample, predicts the most common label among its ``k`` nearest training
    samples.
    """

    def __init__(self, k):
        """Create a classifier that consults ``k`` neighbours (``k >= 1``)."""
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None
        self._Y_train = None

    def fit(self, X_train, Y_train):
        """Store the training samples and labels and return ``self``.

        ``X_train`` and ``Y_train`` must be arrays with the same number of
        rows, and there must be at least ``k`` training samples.
        """
        assert X_train.shape[0] == Y_train.shape[0], \
            "The size of X_train must be equals to the size of Y-Train"
        assert self.k <= X_train.shape[0]
        self._X_train = X_train
        self._Y_train = Y_train
        return self

    def predict(self, x_predict):
        """Return an array holding the predicted label of each sample."""
        # Fail with a clear message instead of an opaque TypeError when the
        # classifier has not been fitted yet.
        assert self._X_train is not None and self._Y_train is not None, \
            "must fit before predict"
        return np.array([self._predict(x) for x in x_predict])

    def _predict(self, x):
        """Return the majority label among the k training samples nearest to x."""
        distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in self._X_train]
        nearest = np.argsort(distances)
        votes = list(self._Y_train[nearest[:self.k]])
        return Counter(votes).most_common(1)[0][0]

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` predicted as ``y_test``."""
        assert X_test.shape[0] == y_test.shape[0], \
            "The size of X_test must be equal to the size of y_test"
        y_predict = self.predict(X_test)
        return float(np.sum(y_predict == y_test) / len(y_test))

    def __repr__(self):
        return "knn(k=%d)" % self.k

# -*- coding: utf-8 -*-
import numpy as np


def train_test_split(X, y, test_radio=0.2, seed=None):
    """Randomly split ``X`` and ``y`` into a training part and a test part.

    Args:
        X: array of samples; rows are shuffled and split.
        y: array of labels with the same number of rows as ``X``.
        test_radio: fraction of the rows (between 0.0 and 1.0) placed in
            the test part; the row count is truncated to an integer.
        seed: optional seed for NumPy's global random generator. Any value
            other than ``None`` (including ``0``) makes the split repeatable.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    assert X.shape[0] == y.shape[0], "The size of X and y must be equal"
    assert 0.0 <= test_radio <= 1.0, "test radio must be valid"

    # Compare against None explicitly: a plain truthiness test would treat
    # seed=0 as "no seed" and silently produce a non-reproducible split.
    if seed is not None:
        np.random.seed(seed)

    shuffled_indexes = np.random.permutation(len(X))
    test_size = int(X.shape[0] * test_radio)

    # The first test_size shuffled rows form the test part, the rest train.
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_test = X[test_indexes]
    y_test = y[test_indexes]
    X_train = X[train_indexes]
    y_train = y[train_indexes]
    return X_train, X_test, y_train, y_test

``import numpy as npfrom sklearn import datasetsimport matplotlib.pyplot as pltimport matplotlib%run MyScripts/KNN.py%run MyScripts/metrics.py%run MyScripts/model_selection.pydigits = datasets.load_digits()X = digits.datay = digits.target``

# Take one sample and view its flat feature vector as an 8x8 image.
some_digit = X[666]
some_digit_image = np.reshape(some_digit, (8, 8))

# Render the image with matplotlib's "binary" colour map.
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)

# Hold out part of the data for evaluation (default split settings).
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit a 6-neighbour classifier on the training part and evaluate it on the
# held-out part; score() must be provided by the KNNClassifier that was
# loaded from MyScripts/KNN.py.
knn_clf = KNNClassifier(k=6)
knn_clf.fit(X_train, y_train)
knn_clf.score(X_test, y_test)

# Grid search to choose the `weights` and `k` hyperparameters.
# NOTE(review): every candidate is scored on the test split itself, so the
# reported best score is optimistic; a separate validation split or
# cross-validation would give an unbiased estimate.
best_k = -1
best_score = -1
best_method = ""
methods = ["uniform", "distance"]

for method in methods:
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        # Strict comparison: on a tie the earliest combination tried wins.
        if score > best_score:
            best_k = k
            best_score = score
            best_method = method

print("best_k = ",best_k)
print("best_score = ",best_score)
print("best_method = ",best_method)

best_k = 3
best_score = 0.9888888888888889
best_method = uniform

# Search jointly over p (passed to KNeighborsClassifier as `p`) and k,
# always with distance-weighted voting, and remember the best combination.
best_k, best_score, best_p = -1, -1, -1

for p in range(1, 6):
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        # Only a strictly better score replaces the current best, so on a
        # tie the earliest combination tried is kept.
        if not score > best_score:
            continue
        best_k, best_score, best_p = k, score, p

print("best_k = ",best_k)
print("best_score = ",best_score)
print("best_p = ",best_p)

best_k = 3
best_score = 0.9888888888888889
best_p = 2