我问这个问题是为了做 vector 量化的作业。
我实现了一种相当经典的算法来检测点簇的中心。然而,在输入数据中,有几个簇(簇的数量和总输入已知),我需要找到每个簇的中心,但我不知道哪些点构成一个簇。因此,如果我设法在集群内部或附近的某个位置(比任何其他初始化中心更接近)初始化 future 的中心点,我的算法可以迭代并转到正确的中心。
但是我不知道如何正确初始化。我正在随机初始化并检查两个中心是否彼此太近以及中心是否距离任何输入点太远,但此方法不容易参数化,即花费大量“计算”时间或无法获得正确的中心.
我的想法很简单,随机初始化并检查该点是否在簇内。有人知道我该怎么做吗?我无法构造多边形,因为我不知道簇的限制。 我更喜欢用 C 语言实现,但我也只接受这些想法!
编辑:输入数据示例:
我的代码:
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <float.h>
#define TRAINING_CYCLE 10000
#define LEARNING_RATE 0.001
#define CENTROID_DISTANCE_SCALE 0.7 //used for setting a minimal distance between centroids
#define CENTROID_POINT_SCALE 0.1
#define CLUSTER_SIZE_PERCENTAGE 0.3
//User: REMOVED
//Password: REMOVED
typedef enum { false, true } bool;
typedef struct point{
double x;
double y;
} point;
int nbOfClusters;
double in1[1000];
double in2[1000];
point centers[100]; //later it is limited by number of clusters
int dataSize=0;
double maxX1, maxY1, maxX2, maxY2=0; //maximums of each data set
double deltaX, deltaY=0; //error toleration of each axis
double getAbs(double n){
if (n>=0){
return n;
} else {
return (-1)*n;
}
}
int findNearestCentroid(point p1){ //returns the location in the table of the nearest centroid to the argument point
double distance=DBL_MAX;
int nearest=0;
for (int i=0; i<nbOfClusters; i++){
double distance_temp = (p1.x-centers[i].x)*(p1.x-centers[i].x)+(p1.y-centers[i].y)*(p1.y-centers[i].y);
if ( distance_temp < distance){
distance=distance_temp;
nearest=i;
}
}
return nearest;
}
double getDistance(point p1, point p2){
return sqrt((p1.x-p2.x)*(p1.x-p2.x)+(p1.y-p2.y)*(p1.y-p2.y));
}
bool isCentroidsNear(double minDistance){
for (int i=0;i<nbOfClusters;i++){
for (int j=0; j<nbOfClusters; j++){
if (i != j){
double temp_distance=getDistance(centers[i],centers[j]);
if (temp_distance<minDistance){ // the distance shouldn't be small
return true;
}
}
}
}
return false; //if nothing hit the condition, there is no centroid too close to another
}
point findNearestInput(int centroid){ //returns the location in the table of the nearest centroid to the argument point
double distance=DBL_MAX;
point returnPoint;
int nearest=0;
for (int i=0; i<nbOfClusters; i++){
double distance_temp = (in1[i]-centers[centroid].x)*(in1[i]-centers[centroid].x)+(in2[i]-centers[centroid].y)*(in2[i]-centers[centroid].y);
if ( distance_temp < distance){
distance=distance_temp;
nearest=i;
}
}
returnPoint.x=in1[nearest];
returnPoint.y=in2[nearest];
return returnPoint;
}
bool isPointNear(double minDistance){
for(int i=0; i<nbOfClusters; i++){
double distance=getDistance(findNearestInput(i),centers[i]); //the distance to the nearest point
if(distance>minDistance){
return true;
}
}
return false;
}
bool isCountNearPoints(double distance){
int counter=0;
for(int i=0;i<nbOfClusters;i++){
point p;
for(int j=0; j<dataSize; j++){
p.x=in1[j];
p.y=in2[j];
double tempDistance=getDistance(p,centers[i]);
if (tempDistance<distance){
counter++;
}
}
//this is the number of points that the centroid should be near to
int minNearPoints = dataSize/nbOfClusters*CLUSTER_SIZE_PERCENTAGE;
if (counter<minNearPoints){
return true;
}
}
return false;
}
int main()
{
char dummy[1];
scanf("%c",&dummy[0]);
nbOfClusters=dummy[0]-'0';
while ( scanf("%lf,%lf", &in1[dataSize], &in2[dataSize]) != EOF){
dataSize++;
}
//finding the maximums to determine the error toleration delta
for(int i =0; i< dataSize; i++){
if(in1[i]>0 && in1[i] > maxX1){
maxX1=in1[i];
}
if(in2[i]>0 && in2[i]>maxY1){
maxY1=in2[i];
}
if(in1[i]<0 && in1[i] < maxX1){
maxX2=in1[i];
}
if(in2[i]<0 && in2[i] < maxY1){
maxY2=in2[i];
}
}
//double minDistance = CENTROID_DISTANCE_SCALE*sqrt((maxX1-maxX2)*(maxX1-maxX2)+(maxY1-maxY2)*(maxY1-maxY2));
double minDistance = 1/nbOfClusters*sqrt((maxX1-maxX2)*(maxX1-maxX2)+(maxY1-maxY2)*(maxY1-maxY2));
double pointMinDistance = CENTROID_POINT_SCALE*sqrt((maxX1-maxX2)*(maxX1-maxX2)+(maxY1-maxY2)*(maxY1-maxY2));
/*
do { //randomly generate centroids but have finally nothing near
for(int i=0; i<nbOfClusters; i++){
centers[i].x=(double)rand()/RAND_MAX*2*(maxX1-maxX2)-(maxX1-maxX2);
centers[i].y=(double)rand()/RAND_MAX*2*(maxY1-maxY2)-(maxY1-maxY2);
}
//} while(isCentroidsNear(minDistance) || isCountNearPoints(pointMinDistance));
} while(isCentroidsNear(minDistance) || isPointNear(pointMinDistance));
//} while(isCentroidsNear(minDistance));
*/
int randomInputs[50];
bool isSame;
//generating nbOfClusters amount of random numbers from dataSize range that will later used to pick inputs
do {
do{
//generate random numbers
for(int i=0; i<nbOfClusters; i++){
randomInputs[i]=(int)((double)rand()/RAND_MAX*dataSize);
}
isSame = false;
//checking if the generated numbers are the same
for(int i=0; i<nbOfClusters-1; i++){
for(int j=i+1; j<nbOfClusters; j++){
if(randomInputs[i]==randomInputs[j] ){
isSame=true;
break;
}
}
if(isSame){
break;
}
}
}while(isSame);
//assign centroids to the generated numbers
for (int i =0;i<nbOfClusters;i++){
centers[i].x=in1[randomInputs[i]];
centers[i].y=in2[randomInputs[i]];
}
}while(isCentroidsNear(minDistance)); //if the centroids are too close, i.e. in the same cluster
//learning
point p1;//point for iteration
for (int ii=0; ii<TRAINING_CYCLE; ii++){
for (int i=0; i<dataSize; i++){
//construct a point
p1.x=in1[i];
p1.y=in2[i];
//find the nearest point and the distance to it
int nearPt=findNearestCentroid(p1);
double distance=getDistance(p1,centers[nearPt]);
//the distance that I want to move it
double deltaDistance=LEARNING_RATE*distance;
//moving the center on the DIRECTION of the other point
//the slope of the line passing through both
double slope=(in2[i]-centers[nearPt].y)/(in1[i]-centers[nearPt].x);
double dx,dy;
// finding how much the x needs to change => totalchange^2=dx^2+dy^2 but I know dy from dx
dx=sqrt(deltaDistance*deltaDistance/(1+slope*slope)); //dx=(totaldist^2/(1+slope^2)
//dx is always positive till now, so it should be neg. if the center is to the right of the point
if(centers[nearPt].x>in1[i]){
dx=(-1)*dx;
}
dy=slope*dx;
//updating the center value
centers[nearPt].x += dx;
centers[nearPt].y += dy;
}
}
//printing the results
for (int i=0; i<nbOfClusters; i++){
printf("%lf,%lf\n",centers[i].x,centers[i].y);
}
return 0;
}
最佳答案
通常的方法是从现有数据中选择点,而不是均匀随机。
由于在您的数据模型中,每个点都属于一个簇,因此选择现有点可以解决您的(模糊)问题,不是吗?
关于c - 如何检查一个点是否在点簇内,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/41696135/