[author]For a while I had been studying embedded development, but the poor quality of some cheap sensors kept my project stuck at the demo stage. Having hit a serious bottleneck, I decided to set it aside for now and move directly to the information-processing layer. During this summer's mathematical modeling contest I also felt my data-handling skills were lacking, so I started shoring up the fundamentals, working through Linear Algebra, Discrete Mathematics, Programming Collective Intelligence, Liao Xuefeng's Python 2.7 tutorial, Machine Learning in Action, and Ensemble Methods: Foundations and Algorithms in parallel.[/author]
k-Nearest Neighbor
Also known as the k-nearest-neighbors algorithm (kNN), it is a popular supervised, instance-based classification algorithm.
How it works: there is a set of sample data, also called the training set, in which every sample carries a label; that is, we know which class each sample in the set belongs to. Given a new, unlabeled input, we compare each of its features against the corresponding features of the samples in the training set, and the algorithm extracts the class labels of the most similar samples (the nearest neighbors). In general we look only at the top k most similar samples in the training set; this is where the k in k-nearest neighbors comes from, and k is usually chosen as an integer no larger than 20. Finally, the class that appears most often among those k most similar samples is taken as the class of the new input.
My understanding: we have a sample set whose classes are known, where every sample can be filed under some label. Now we are given an unknown sample and want to know which label it falls under. We extract features from the known samples, compare the unknown sample against the known set, pick out the k closest samples, and take the label that occurs most frequently among those k. The unknown sample is then classified under that label.
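The steps above can be sketched in a few lines of plain Python. This is only my own minimal illustration of the idea, not the book's code; `knn_classify` and the sample points are made up here:

```python
from collections import Counter

def knn_classify(x, samples, labels, k):
    """Classify point x by majority vote among its k nearest labeled samples."""
    # squared Euclidean distance from x to every training sample
    dists = [sum((a - b) ** 2 for a, b in zip(x, s)) for s in samples]
    # indices of the k smallest distances
    nearest = sorted(range(len(samples)), key=lambda i: dists[i])[:k]
    # most common label among those k neighbours
    return Counter(labels[i] for i in nearest).most_common(1)[0][0]

samples = [[1.0, 1.1], [1.0, 1.0], [0.0, 0.0], [0.0, 0.1]]
labels = ['A', 'A', 'B', 'B']
print(knn_classify([0.0, 0.2], samples, labels, 3))  # 'B': two of the three nearest are B
```

Squared distances are enough for ranking neighbors, so the square root can be skipped.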
Corrections from more experienced readers are welcome.
Following Machine Learning in Action, I wrote the Python program below.
[newtext title='kNN']
from numpy import *
import operator
def createDataSet():
    # four training points with two features each, plus their labels
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels

def classify0(inX, dataSet, labels, k):
    dataSetSize = dataSet.shape[0]
    # tile the input so it can be subtracted from every training point at once
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5            # Euclidean distance to each sample
    sortedDistIndicies = distances.argsort()  # indices sorted by distance
    classCount = {}
    for i in range(k):
        # vote: count the label of each of the k nearest neighbors
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # sort labels by vote count, descending (Python 2 dict.iteritems())
    sortedClassCount = sorted(classCount.iteritems(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
[/newtext]
[newtext title='Sample session:']
Python 2.7.9 (default, Dec 10 2014, 12:24:55) [MSC v.1500 32 bit (Intel)] on win32
Type "copyright", "credits" or "license()" for more information.
>>> import kNN
>>> group,labels=kNN.createDataSet()
>>> group
array([[ 1. , 1.1],
[ 1. , 1. ],
[ 0. , 0. ],
[ 0. , 0.1]])
>>> labels
['A', 'A', 'B', 'B']
>>> kNN.classify0([0.5,0.1],group,labels,3)
'B'
>>> kNN.classify0([0.5,0.5],group,labels,2)
'A'
>>> kNN.classify0([0.5,0.5],group,labels,3)
'B'
>>> [/newtext]
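Note that the book's code is Python 2 only: `dict.iteritems()` was removed in Python 3. For readers on Python 3, a minimal port might look like this (a sketch, assuming NumPy is installed):

```python
import numpy as np
import operator

def classify0(inX, dataSet, labels, k):
    """k-NN classifier: same logic as the book version, Python 3 syntax."""
    dataSetSize = dataSet.shape[0]
    # tile the input so it can be subtracted from every training point at once
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5  # Euclidean distances
    sortedDistIndicies = distances.argsort()         # nearest first
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # dict.iteritems() is gone in Python 3; use items() instead
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
labels = ['A', 'A', 'B', 'B']
print(classify0([0, 0], group, labels, 3))  # 'B'
```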
Later, while looking for training data, I happened to discover that someone had written a C++ implementation of kNN:
[text]
#include <iostream>
#include <map>
#include <vector>
#include <cstdio>
#include <cmath>
#include <cstdlib>
#include <algorithm>
#include <fstream>
using namespace std;

typedef char tLabel;
typedef double tData;
typedef pair<int, double> PAIR;
const int colLen = 2;
const int rowLen = 12;
ifstream fin;
ofstream fout;

class KNN
{
private:
    tData dataSet[rowLen][colLen];
    tLabel labels[rowLen];
    tData testData[colLen];
    int k;
    map<int, double> map_index_dis;
    map<tLabel, int> map_label_freq;
    double get_distance(tData *d1, tData *d2);
public:
    KNN(int k);
    void get_all_distance();
    void get_max_freq_label();
    struct CmpByValue
    {
        bool operator()(const PAIR& lhs, const PAIR& rhs)
        {
            return lhs.second < rhs.second;
        }
    };
};

KNN::KNN(int k)
{
    this->k = k;
    fin.open("data.txt");
    if (!fin)
    {
        cout << "can not open the file data.txt" << endl;
        exit(1);
    }
    /* input the dataSet */
    for (int i = 0; i < rowLen; i++)
    {
        for (int j = 0; j < colLen; j++)
        {
            fin >> dataSet[i][j];
        }
        fin >> labels[i];
    }
    cout << "please input the test data :" << endl;
    /* input the test data */
    for (int i = 0; i < colLen; i++)
        cin >> testData[i];
}

/*
 * calculate the distance between the test data and dataSet[i]
 */
double KNN::get_distance(tData *d1, tData *d2)
{
    double sum = 0;
    for (int i = 0; i < colLen; i++)
    {
        sum += pow((d1[i] - d2[i]), 2);
    }
    // cout<<"the sum is = "<<sum<<endl;
    return sqrt(sum);
}

/*
 * calculate the distance between the test data and each training sample
 */
void KNN::get_all_distance()
{
    double distance;
    for (int i = 0; i < rowLen; i++)
    {
        distance = get_distance(dataSet[i], testData);
        // <key,value> => <i,distance>
        map_index_dis[i] = distance;
    }
    // traverse the map to print each index and distance
    map<int, double>::const_iterator it = map_index_dis.begin();
    while (it != map_index_dis.end())
    {
        cout << "index = " << it->first << " distance = " << it->second << endl;
        it++;
    }
}

/*
 * check which label the test data belongs to, to classify the test data
 */
void KNN::get_max_freq_label()
{
    // copy map_index_dis into a vector so it can be sorted
    vector<PAIR> vec_index_dis(map_index_dis.begin(), map_index_dis.end());
    // sort by distance, ascending, to get the nearest samples first
    sort(vec_index_dis.begin(), vec_index_dis.end(), CmpByValue());
    for (int i = 0; i < k; i++)
    {
        cout << "the index = " << vec_index_dis[i].first
             << " the distance = " << vec_index_dis[i].second
             << " the label = " << labels[vec_index_dis[i].first]
             << " the coordinate ( " << dataSet[vec_index_dis[i].first][0]
             << "," << dataSet[vec_index_dis[i].first][1] << " )" << endl;
        // count the occurrences of each label
        map_label_freq[labels[vec_index_dis[i].first]]++;
    }
    map<tLabel, int>::const_iterator map_it = map_label_freq.begin();
    tLabel label;
    int max_freq = 0;
    // find the most frequent label
    while (map_it != map_label_freq.end())
    {
        if (map_it->second > max_freq)
        {
            max_freq = map_it->second;
            label = map_it->first;
        }
        map_it++;
    }
    cout << "The test data belongs to the " << label << " label" << endl;
}

int main()
{
    int k;
    cout << "please input the k value : " << endl;
    cin >> k;
    KNN knn(k);
    knn.get_all_distance();
    knn.get_max_freq_label();
    system("pause");
    return 0;
}
[/text]
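The C++ constructor above expects a `data.txt` with `rowLen = 12` rows, each holding `colLen = 2` coordinates followed by a one-character label. The values below are made up purely for illustration; any 12 rows in this layout would do:

```python
# Write a sample data.txt in the layout the C++ constructor reads:
# two coordinates followed by a one-character label, 12 rows in total.
rows = [(1.0, 1.1, 'A'), (1.0, 1.0, 'A'), (1.2, 0.9, 'A'),
        (0.9, 1.2, 'A'), (1.1, 1.0, 'A'), (1.0, 0.9, 'A'),
        (0.0, 0.0, 'B'), (0.0, 0.1, 'B'), (0.1, 0.0, 'B'),
        (0.2, 0.1, 'B'), (0.1, 0.2, 'B'), (0.0, 0.2, 'B')]
with open('data.txt', 'w') as f:
    for x, y, label in rows:
        f.write('%g %g %c\n' % (x, y, label))
```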
Here I would also like to recommend a website with machine learning data.
Comments
Your blog loads really fast. That said, the kNN algorithm has some pretty big limitations.
@AH主机 Haha, I'm still learning. :grin: