【CUDA Parallel Programming, Part 8】Implementing the K-means Algorithm with CUDA


This article explains how to implement the K-means algorithm from machine learning with the CUDA parallel computing framework. A detailed introduction to the K-means algorithm itself is given here; this article focuses on the parallel implementation.

First, a quick review of the serial K-means procedure:

Pseudocode:


  1. Create k points as the initial centroids (often chosen at random)
  2. While the cluster assignment of any point changes:
  3. For each data point in the data set:
  4. For each centroid:
  5. Compute the distance between the centroid and the data point
  6. Assign the data point to the cluster whose centroid is closest
  7. For each cluster, compute the mean of all points assigned to it and use that mean as the new centroid

Two parts of this procedure can be parallelized:

① Lines 3-4: the computation of the distances from each data point to the multiple centroids can be parallelized.

② Line 5: the computation of the distance from a data point to a given centroid can be parallelized. A serial sketch of these loops is given below for reference.
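
The sketch below is an illustrative rewrite of the serial assignment step, not code from this article (the names assign_serial, best and bestDist are made up). In the CUDA version, the outer loop over data points (pseudocode line 3) is mapped to one GPU thread per point, and the distance of line 5 is computed inside each thread by the device function euclid_dist_2.

    // Illustrative serial assignment step of K-means (not part of the article's code).
    // objects:  numObjs x numCoords data points
    // clusters: numClusters x numCoords centroids
    // membership[i] receives the index of the closest centroid for point i.
    void assign_serial(int numObjs, int numClusters, int numCoords,
                       float **objects, float **clusters, int *membership)
    {
        for (int i = 0; i < numObjs; i++)             // pseudocode line 3
        {
            int best = 0;
            float bestDist = 0.0f;
            for (int k = 0; k < numClusters; k++)     // pseudocode line 4
            {
                float d = 0.0f;
                for (int c = 0; c < numCoords; c++)   // pseudocode line 5
                    d += (objects[i][c] - clusters[k][c]) * (objects[i][c] - clusters[k][c]);
                if (k == 0 || d < bestDist)
                {
                    bestDist = d;
                    best = k;
                }
            }
            membership[i] = best;                     // pseudocode line 6
        }
    }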

The KMEANS class:


    class KMEANS
    {
    private:
        int numClusters;
        int numCoords;
        int numObjs;
        int *membership;     // [numObjs]
        char *filename;
        float **objects;     // [numObjs][numCoords] data objects
        float **clusters;    // [numClusters][numCoords] cluster centers
        float threshold;
        int loop_iterations;
    public:
        KMEANS(int k);
        void file_read(char *fn);
        void file_write();
        void cuda_kmeans();
        inline int nextPowerOfTwo(int n);
        void free_memory();
        virtual ~KMEANS();
    };//KMEANS

Member variables:

numClusters: the number of cluster centers

numCoords: the dimensionality of each data point

numObjs: the number of data points

membership: the cluster assignment of each data point, of size numObjs

filename: the name of the input file

objects: all data points, with dimensions [numObjs][numCoords]

clusters: the cluster centers, with dimensions [numClusters][numCoords]

threshold: the threshold that controls when the iteration stops

loop_iterations: the number of iterations performed

Member functions:

KMEANS(int k): parameterized constructor; initializes the member variables

file_read(char *fn): reads the input file and initializes the objects and membership members

file_write(): writes the results back to the output files

cuda_kmeans(): the entry point of the K-means computation

nextPowerOfTwo(int n): returns the smallest power of two that is greater than or equal to n, e.g. nextPowerOfTwo(100) returns 128 (see the small check after this list)

free_memory(): releases allocated memory

~KMEANS(): destructor
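
The bit-twiddling in nextPowerOfTwo works by smearing the highest set bit of n-1 into every lower bit position and then adding one. A small standalone check (illustrative; this is a free-function copy of the member function, not part of the article's class):

    // Standalone check of the next-power-of-two bit trick.
    #include <cstdio>

    static int nextPowerOfTwo(int n)
    {
        n--;
        n = n >> 1 | n;    // after these five steps, every bit below the
        n = n >> 2 | n;    // highest set bit of the original n-1 is set
        n = n >> 4 | n;
        n = n >> 8 | n;
        n = n >> 16 | n;
        return ++n;        // 0111...1 + 1 = 1000...0, the next power of two
    }

    int main()
    {
        printf("%d %d %d\n", nextPowerOfTwo(1), nextPowerOfTwo(100), nextPowerOfTwo(128));
        // prints: 1 128 128
        return 0;
    }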

The parallel code consists mainly of three functions:

find_nearest_cluster(…)

compute_delta(…)

euclid_dist_2(…)

First, look at euclid_dist_2(…):


    __host__ __device__ inline static
    float euclid_dist_2(int numCoords, int numObjs, int numClusters,
                        float *objects, float *clusters, int objectId, int clusterId)
    {
        int i;
        float ans = 0;
        for (i = 0; i < numCoords; i++)
        {
            ans += (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]) *
                   (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]);
        }
        return ans;
    }

This function is called by each thread and computes the squared Euclidean distance between the vectors objects[objectId] and clusters[clusterId], i.e. the distance from the objectId-th data point to the clusterId-th cluster center; since every thread handles a different objectId, these distances are computed in parallel across the grid. Both arrays are stored in coordinate-major layout ([numCoords][numObjs] and [numCoords][numClusters]), so consecutive threads read consecutive memory locations.
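
The index arithmetic matches the transposed host arrays built later in cuda_kmeans(): coordinate i of point objectId lives at offset numObjs * i + objectId in the flattened array. A tiny host-side check of that layout (illustrative toy sizes, not part of the article's code):

    // Toy check of the coordinate-major layout used by euclid_dist_2.
    #include <cassert>

    int main()
    {
        const int numCoords = 2, numObjs = 3;
        float flat[numCoords * numObjs] = {
            1.0f, 2.0f, 3.0f,   // coordinate 0 of points 0..2
            4.0f, 5.0f, 6.0f    // coordinate 1 of points 0..2
        };
        // coordinate i of point objectId is flat[numObjs * i + objectId]
        assert(flat[numObjs * 1 + 2] == 6.0f);   // coordinate 1 of point 2
        return 0;
    }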

Next, look at compute_delta(…):


    /*
     * numIntermediates:  the actual number of intermediates
     * numIntermediates2: the next power of two
     */
    __global__ static void compute_delta(int *deviceIntermediates,
                                         int numIntermediates, int numIntermediates2)
    {
        extern __shared__ unsigned int intermediates[];
        intermediates[threadIdx.x] =
            (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0;
        __syncthreads();
        // numIntermediates2 *must* be a power of two!
        for (unsigned int s = numIntermediates2 / 2; s > 0; s >>= 1)
        {
            if (threadIdx.x < s)
            {
                intermediates[threadIdx.x] += intermediates[threadIdx.x + s];
            }
            __syncthreads();
        }
        if (threadIdx.x == 0)
        {
            deviceIntermediates[0] = intermediates[0];
        }
    }

This kernel sums, within a single thread block, the per-thread entries of intermediates in shared memory and writes the total to deviceIntermediates[0], which is then copied back to host memory. A more detailed explanation of this pattern can be found here; it is simply an array-summation (reduction) problem, used in this program to add up the per-block counts of membership changes, i.e. the number of points that switched clusters in the current iteration.
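
For clarity, the reduction produces the same value as this serial loop would on the host (an illustrative sketch; sum_intermediates is a made-up name):

    // Serial equivalent of compute_delta's reduction (illustrative, not the article's code).
    // Returns the total number of points that changed cluster in this iteration.
    static int sum_intermediates(const int *intermediates, int numIntermediates)
    {
        int total = 0;
        for (int b = 0; b < numIntermediates; b++)   // one entry per block of find_nearest_cluster
            total += intermediates[b];
        return total;
    }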

Finally, look at find_nearest_cluster(…):


    /*
     * objects:        [numCoords][numObjs]
     * deviceClusters: [numCoords][numClusters]
     * membership:     [numObjs]
     */
    __global__ static void find_nearest_cluster(int numCoords, int numObjs, int numClusters,
                                                float *objects, float *deviceClusters,
                                                int *membership, int *intermediates)
    {
        extern __shared__ char sharedMemory[];
        unsigned char *membershipChanged = (unsigned char *)sharedMemory;
        float *clusters = deviceClusters;
        membershipChanged[threadIdx.x] = 0;
        int objectId = blockDim.x * blockIdx.x + threadIdx.x;
        if (objectId < numObjs)
        {
            int index;
            float dist, min_dist;
            /* find the cluster id that has min distance to object */
            index = 0;
            min_dist = euclid_dist_2(numCoords, numObjs, numClusters, objects, clusters, objectId, 0);
            for (int i = 0; i < numClusters; i++)
            {
                dist = euclid_dist_2(numCoords, numObjs, numClusters, objects, clusters, objectId, i);
                /* no need square root */
                if (dist < min_dist)
                {
                    min_dist = dist;
                    index = i;
                }
            }
            if (membership[objectId] != index)
            {
                membershipChanged[threadIdx.x] = 1;
            }
            // assign the membership to object objectId
            membership[objectId] = index;
            __syncthreads();    // for membershipChanged[]
    #if 1
            // blockDim.x *must* be a power of two!
            for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
            {
                if (threadIdx.x < s)
                {
                    // sum all changed flags; the block's result ends up in membershipChanged[0]
                    membershipChanged[threadIdx.x] += membershipChanged[threadIdx.x + s];
                }
                __syncthreads();
            }
            if (threadIdx.x == 0)
            {
                intermediates[blockIdx.x] = membershipChanged[0];
            }
    #endif
        }
    }//find_nearest_cluster

For data point objectId, this kernel computes the distance to each of the numClusters cluster centers, assigns the point to the nearest one, and updates membership accordingly; each block then reduces its membershipChanged flags in shared memory and writes the number of points that changed cluster to intermediates[blockIdx.x].
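
The kernel is launched with one thread per data point; the block size must be a power of two so the in-block reduction works, and the dynamic shared-memory size provides one unsigned char per thread. A sketch of the launch using the constants defined in cuda_kmeans() below (the helper name launch_assignment_step is made up for illustration):

    // Sketch: launching find_nearest_cluster with one thread per data point.
    static void launch_assignment_step(int numCoords, int numObjs, int numClusters,
                                       float *deviceObjects, float *deviceClusters,
                                       int *deviceMembership, int *deviceIntermediates)
    {
        const unsigned int numThreadsPerClusterBlock = 32;   // must be a power of two
        const unsigned int numClusterBlocks =
            (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;
        const unsigned int clusterBlockSharedDataSize =
            numThreadsPerClusterBlock * sizeof(unsigned char);  // one change flag per thread

        find_nearest_cluster<<<numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize>>>
            (numCoords, numObjs, numClusters, deviceObjects, deviceClusters,
             deviceMembership, deviceIntermediates);
        cudaDeviceSynchronize();
    }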

These three functions parallelize the parts of the algorithm that benefit most from it (the assignment step and the counting of membership changes), giving a parallel version of the overall algorithm; the centroid update itself remains on the host.

The complete code follows:

kmeans.h:


    #ifndef _H_KMEANS
    #define _H_KMEANS

    #include <assert.h>

    #define malloc2D(name, xDim, yDim, type) do {                  \
        name = (type **)malloc(xDim * sizeof(type *));             \
        assert(name != NULL);                                      \
        name[0] = (type *)malloc(xDim * yDim * sizeof(type));      \
        assert(name[0] != NULL);                                   \
        for (size_t i = 1; i < xDim; i++)                          \
            name[i] = name[i-1] + yDim;                            \
    } while (0)

    double wtime(void);

    #endif
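
malloc2D allocates an xDim-by-yDim array as one contiguous block: name[0] points to the whole block and the name[i] are row pointers into it, which is why cuda_kmeans() can pass dimObjects[0] directly to cudaMemcpy. A minimal usage sketch (illustrative sizes, not from the article):

    // Minimal usage sketch for malloc2D.
    #include <assert.h>
    #include <stdlib.h>
    #include "kmeans.h"

    int main()
    {
        float **a;
        malloc2D(a, 3, 4, float);        // 3 rows of 4 floats, stored contiguously in a[0]
        a[2][1] = 1.5f;
        assert(a[0][2 * 4 + 1] == 1.5f); // same element through the contiguous block
        free(a[0]);                      // one free for the data block...
        free(a);                         // ...and one for the row-pointer array
        return 0;
    }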

wtime.cu:


    #include <sys/time.h>
    #include <stdio.h>
    #include <stdlib.h>

    double wtime(void)
    {
        double now_time;
        struct timeval etstart;
        struct timezone tzp;

        if (gettimeofday(&etstart, &tzp) == -1)
            perror("Error: calling gettimeofday() not successful.\n");

        now_time = ((double)etstart.tv_sec) +             /* in seconds */
                   ((double)etstart.tv_usec) / 1000000.0; /* in microseconds */
        return now_time;
    }

cuda_kmeans.cu:


    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <assert.h>
    #include <iostream>
    #include <cuda_runtime.h>

    #include "kmeans.h"

    using namespace std;
    const int MAX_CHAR_PER_LINE = 1024;

    class KMEANS
    {
    private:
        int numClusters;
        int numCoords;
        int numObjs;
        int *membership;     // [numObjs]
        char *filename;
        float **objects;     // [numObjs][numCoords] data objects
        float **clusters;    // [numClusters][numCoords] cluster centers
        float threshold;
        int loop_iterations;
    public:
        KMEANS(int k);
        void file_read(char *fn);
        void file_write();
        void cuda_kmeans();
        inline int nextPowerOfTwo(int n);
        void free_memory();
        virtual ~KMEANS();
    };

    KMEANS::~KMEANS()
    {
        delete[] membership;
        free(clusters[0]);
        free(clusters);
        for (int i = 0; i < numObjs; i++)
            delete[] objects[i];
        delete[] objects;
    }
    KMEANS::KMEANS(int k)
    {
        threshold = 0.001;
        numObjs = 0;
        numCoords = 0;
        numClusters = k;
        filename = NULL;
        loop_iterations = 0;
    }

    void KMEANS::file_write()
    {
        FILE *fptr;
        char outFileName[1024];

        // output: the coordinates of the cluster centres
        sprintf(outFileName, "%s.cluster_centres", filename);
        printf("Writing coordinates of K=%d cluster centers to file \"%s\"\n", numClusters, outFileName);
        fptr = fopen(outFileName, "w");
        for (int i = 0; i < numClusters; i++)
        {
            fprintf(fptr, "%d ", i);
            for (int j = 0; j < numCoords; j++)
                fprintf(fptr, "%f ", clusters[i][j]);
            fprintf(fptr, "\n");
        }
        fclose(fptr);

        // output: the closest cluster centre to each of the data points
        sprintf(outFileName, "%s.membership", filename);
        printf("Writing membership of N=%d data objects to file \"%s\"\n", numObjs, outFileName);
        fptr = fopen(outFileName, "w");
        for (int i = 0; i < numObjs; i++)
        {
            fprintf(fptr, "%d %d\n", i, membership[i]);
        }
        fclose(fptr);
    }

    inline int KMEANS::nextPowerOfTwo(int n)
    {
        n--;
        n = n >> 1 | n;
        n = n >> 2 | n;
        n = n >> 4 | n;
        n = n >> 8 | n;
        n = n >> 16 | n;
        //n = n >> 32 | n;    // for 64-bit ints
        return ++n;
    }
    __host__ __device__ inline static
    float euclid_dist_2(int numCoords, int numObjs, int numClusters,
                        float *objects, float *clusters, int objectId, int clusterId)
    {
        int i;
        float ans = 0;
        for (i = 0; i < numCoords; i++)
        {
            ans += (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]) *
                   (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]);
        }
        return ans;
    }

    /*
     * numIntermediates:  the actual number of intermediates
     * numIntermediates2: the next power of two
     */
    __global__ static void compute_delta(int *deviceIntermediates,
                                         int numIntermediates, int numIntermediates2)
    {
        extern __shared__ unsigned int intermediates[];
        intermediates[threadIdx.x] =
            (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0;
        __syncthreads();
        // numIntermediates2 *must* be a power of two!
        for (unsigned int s = numIntermediates2 / 2; s > 0; s >>= 1)
        {
            if (threadIdx.x < s)
            {
                intermediates[threadIdx.x] += intermediates[threadIdx.x + s];
            }
            __syncthreads();
        }
        if (threadIdx.x == 0)
        {
            deviceIntermediates[0] = intermediates[0];
        }
    }

    /*
     * objects:        [numCoords][numObjs]
     * deviceClusters: [numCoords][numClusters]
     * membership:     [numObjs]
     */
    __global__ static void find_nearest_cluster(int numCoords, int numObjs, int numClusters,
                                                float *objects, float *deviceClusters,
                                                int *membership, int *intermediates)
    {
        extern __shared__ char sharedMemory[];
        unsigned char *membershipChanged = (unsigned char *)sharedMemory;
        float *clusters = deviceClusters;
        membershipChanged[threadIdx.x] = 0;
        int objectId = blockDim.x * blockIdx.x + threadIdx.x;
        if (objectId < numObjs)
        {
            int index;
            float dist, min_dist;
            /* find the cluster id that has min distance to object */
            index = 0;
            min_dist = euclid_dist_2(numCoords, numObjs, numClusters, objects, clusters, objectId, 0);
            for (int i = 0; i < numClusters; i++)
            {
                dist = euclid_dist_2(numCoords, numObjs, numClusters, objects, clusters, objectId, i);
                /* no need square root */
                if (dist < min_dist)
                {
                    min_dist = dist;
                    index = i;
                }
            }
            if (membership[objectId] != index)
            {
                membershipChanged[threadIdx.x] = 1;
            }
            // assign the membership to object objectId
            membership[objectId] = index;
            __syncthreads();    // for membershipChanged[]
    #if 1
            // blockDim.x *must* be a power of two!
            for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
            {
                if (threadIdx.x < s)
                {
                    // sum all changed flags; the block's result ends up in membershipChanged[0]
                    membershipChanged[threadIdx.x] += membershipChanged[threadIdx.x + s];
                }
                __syncthreads();
            }
            if (threadIdx.x == 0)
            {
                intermediates[blockIdx.x] = membershipChanged[0];
            }
    #endif
        }
    }//find_nearest_cluster
    void KMEANS::cuda_kmeans()
    {
        int index, loop = 0;
        int *newClusterSize;     // [numClusters]: no. objects assigned in each new cluster
        float delta;             // fraction of objects that change their clusters
        float **dimObjects;      // [numCoords][numObjs]
        float **dimClusters;     // [numCoords][numClusters]
        float **newClusters;     // [numCoords][numClusters]
        float *deviceObjects;    // [numCoords][numObjs]
        float *deviceClusters;   // [numCoords][numClusters]
        int *deviceMembership;
        int *deviceIntermediates;

        // Copy objects given in [numObjs][numCoords] layout to new [numCoords][numObjs] layout
        malloc2D(dimObjects, numCoords, numObjs, float);
        for (int i = 0; i < numCoords; i++)
        {
            for (int j = 0; j < numObjs; j++)
            {
                dimObjects[i][j] = objects[j][i];
            }
        }

        // pick first numClusters elements of objects[] as initial cluster centers
        malloc2D(dimClusters, numCoords, numClusters, float);
        for (int i = 0; i < numCoords; i++)
        {
            for (int j = 0; j < numClusters; j++)
            {
                dimClusters[i][j] = dimObjects[i][j];
            }
        }

        newClusterSize = new int[numClusters];
        assert(newClusterSize != NULL);
        memset(newClusterSize, 0, numClusters * sizeof(int));
        malloc2D(newClusters, numCoords, numClusters, float);
        memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));

        // To support reduction, numThreadsPerClusterBlock *must* be a power of two, and it
        // *must* be no larger than the number of bits that will fit into an unsigned char,
        // the type used to keep track of membership changes in the kernel.
        const unsigned int numThreadsPerClusterBlock = 32;
        const unsigned int numClusterBlocks =
            (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;
        const unsigned int numReductionThreads = nextPowerOfTwo(numClusterBlocks);
        const unsigned int clusterBlockSharedDataSize =
            numThreadsPerClusterBlock * sizeof(unsigned char);
        const unsigned int reductionBlockSharedDataSize =
            numReductionThreads * sizeof(unsigned int);

        cudaMalloc(&deviceObjects, numObjs * numCoords * sizeof(float));
        cudaMalloc(&deviceClusters, numClusters * numCoords * sizeof(float));
        cudaMalloc(&deviceMembership, numObjs * sizeof(int));
        cudaMalloc(&deviceIntermediates, numReductionThreads * sizeof(unsigned int));

        cudaMemcpy(deviceObjects, dimObjects[0], numObjs * numCoords * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(deviceMembership, membership, numObjs * sizeof(int), cudaMemcpyHostToDevice);

        do
        {
            cudaMemcpy(deviceClusters, dimClusters[0], numClusters * numCoords * sizeof(float), cudaMemcpyHostToDevice);

            find_nearest_cluster<<<numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize>>>
                (numCoords, numObjs, numClusters, deviceObjects, deviceClusters,
                 deviceMembership, deviceIntermediates);
            cudaDeviceSynchronize();

            compute_delta<<<1, numReductionThreads, reductionBlockSharedDataSize>>>
                (deviceIntermediates, numClusterBlocks, numReductionThreads);
            cudaDeviceSynchronize();

            int d;
            cudaMemcpy(&d, deviceIntermediates, sizeof(int), cudaMemcpyDeviceToHost);
            delta = (float)d;

            cudaMemcpy(membership, deviceMembership, numObjs * sizeof(int), cudaMemcpyDeviceToHost);

            for (int i = 0; i < numObjs; i++)
            {
                // index of the nearest cluster center for object i
                index = membership[i];
                // update new cluster centers: sum of objects located within
                newClusterSize[index]++;
                for (int j = 0; j < numCoords; j++)
                {
                    newClusters[j][index] += objects[i][j];
                }
            }

            // average the sum and replace old cluster centers with newClusters
            for (int i = 0; i < numClusters; i++)
            {
                for (int j = 0; j < numCoords; j++)
                {
                    if (newClusterSize[i] > 0)
                        dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
                    newClusters[j][i] = 0.0;   // set back to 0
                }
                newClusterSize[i] = 0;         // set back to 0
            }
            delta /= numObjs;
        } while (delta > threshold && loop++ < 500);

        loop_iterations = loop + 1;

        malloc2D(clusters, numClusters, numCoords, float);
        for (int i = 0; i < numClusters; i++)
        {
            for (int j = 0; j < numCoords; j++)
            {
                clusters[i][j] = dimClusters[j][i];
            }
        }

        cudaFree(deviceObjects);
        cudaFree(deviceClusters);
        cudaFree(deviceMembership);
        cudaFree(deviceIntermediates);

        free(dimObjects[0]);
        free(dimObjects);
        free(dimClusters[0]);
        free(dimClusters);
        free(newClusters[0]);
        free(newClusters);
        delete[] newClusterSize;
    }
    void KMEANS::file_read(char *fn)
    {
        FILE *infile;
        char *line = new char[MAX_CHAR_PER_LINE];
        int lineLen = MAX_CHAR_PER_LINE;

        filename = fn;
        infile = fopen(filename, "r");
        assert(infile != NULL);

        /* find the number of objects */
        while (fgets(line, lineLen, infile))
        {
            numObjs++;
        }

        /* find the dimension of each object */
        rewind(infile);
        while (fgets(line, lineLen, infile) != NULL)
        {
            if (strtok(line, " \t\n") != 0)
            {
                while (strtok(NULL, " \t\n"))
                    numCoords++;
                break;
            }
        }

        /* allocate space for objects[][] and read all objects */
        rewind(infile);
        objects = new float*[numObjs];
        for (int i = 0; i < numObjs; i++)
        {
            objects[i] = new float[numCoords];
        }

        int i = 0;
        /* read all objects */
        while (fgets(line, lineLen, infile) != NULL)
        {
            if (strtok(line, " \t\n") == NULL) continue;
            for (int j = 0; j < numCoords; j++)
            {
                objects[i][j] = atof(strtok(NULL, " ,\t\n"));
            }
            i++;
        }
        fclose(infile);
        delete[] line;

        /* membership: the cluster id for each data object */
        membership = new int[numObjs];
        assert(membership != NULL);
        for (int i = 0; i < numObjs; i++)
            membership[i] = -1;
    }
    int main(int argc, char *argv[])
    {
        if (argc < 3)
        {
            fprintf(stderr, "usage: %s <numClusters> <input file>\n", argv[0]);
            return 1;
        }
        KMEANS kmeans(atoi(argv[1]));
        kmeans.file_read(argv[2]);
        kmeans.cuda_kmeans();
        kmeans.file_write();
        return 0;
    }

makefile:


    target:
    	nvcc cuda_kmeans.cu
    	./a.out 4 ./Image_data/color100.txt

All of the code and data files are available here: http://yunpan.cn/cKBZMPAJ8tcAs (access code: 9476)

Running the code:

[Screenshot of the program's output in the original post]

The CUDA implementation of K-means is relatively involved and may take some effort to read through; if you have questions, please leave a comment.

Author:忆之独秀

Email:leaguenew@qq.com

Original source: http://blog.csdn.net/lavorange/article/details/41942323
