Reputation: 98
I am using following code to run kmeans algorithm on Iris flower dataset- https://github.com/marcoscastro/kmeans/blob/master/kmeans.cpp
I have modified the above code to read input from files. Below is my code -
#include <iostream>
#include <vector>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <algorithm>
#include <fstream>
using namespace std;
class Point
{
private:
int id_point, id_cluster;
vector<double> values;
int total_values;
string name;
public:
Point(int id_point, vector<double>& values, string name = "")
{
this->id_point = id_point;
total_values = values.size();
for(int i = 0; i < total_values; i++)
this->values.push_back(values[i]);
this->name = name;
this->id_cluster = -1;
}
int getID()
{
return id_point;
}
void setCluster(int id_cluster)
{
this->id_cluster = id_cluster;
}
int getCluster()
{
return id_cluster;
}
double getValue(int index)
{
return values[index];
}
int getTotalValues()
{
return total_values;
}
void addValue(double value)
{
values.push_back(value);
}
string getName()
{
return name;
}
};
class Cluster
{
private:
int id_cluster;
vector<double> central_values;
vector<Point> points;
public:
Cluster(int id_cluster, Point point)
{
this->id_cluster = id_cluster;
int total_values = point.getTotalValues();
for(int i = 0; i < total_values; i++)
central_values.push_back(point.getValue(i));
points.push_back(point);
}
void addPoint(Point point)
{
points.push_back(point);
}
bool removePoint(int id_point)
{
int total_points = points.size();
for(int i = 0; i < total_points; i++)
{
if(points[i].getID() == id_point)
{
points.erase(points.begin() + i);
return true;
}
}
return false;
}
double getCentralValue(int index)
{
return central_values[index];
}
void setCentralValue(int index, double value)
{
central_values[index] = value;
}
Point getPoint(int index)
{
return points[index];
}
int getTotalPoints()
{
return points.size();
}
int getID()
{
return id_cluster;
}
};
class KMeans
{
private:
int K; // number of clusters
int total_values, total_points, max_iterations;
vector<Cluster> clusters;
// return ID of nearest center (uses euclidean distance)
int getIDNearestCenter(Point point)
{
double sum = 0.0, min_dist;
int id_cluster_center = 0;
for(int i = 0; i < total_values; i++)
{
sum += pow(clusters[0].getCentralValue(i) -
point.getValue(i), 2.0);
}
min_dist = sqrt(sum);
for(int i = 1; i < K; i++)
{
double dist;
sum = 0.0;
for(int j = 0; j < total_values; j++)
{
sum += pow(clusters[i].getCentralValue(j) -
point.getValue(j), 2.0);
}
dist = sqrt(sum);
if(dist < min_dist)
{
min_dist = dist;
id_cluster_center = i;
}
}
return id_cluster_center;
}
public:
KMeans(int K, int total_points, int total_values, int max_iterations)
{
this->K = K;
this->total_points = total_points;
this->total_values = total_values;
this->max_iterations = max_iterations;
}
void run(vector<Point> & points)
{
if(K > total_points)
return;
vector<int> prohibited_indexes;
printf("Inside run \n");
// choose K distinct values for the centers of the clusters
printf(" K distinct cluster\n");
for(int i = 0; i < K; i++)
{
while(true)
{
int index_point = rand() % total_points;
if(find(prohibited_indexes.begin(), prohibited_indexes.end(),
index_point) == prohibited_indexes.end())
{
printf("i= %d\n",i);
prohibited_indexes.push_back(index_point);
points[index_point].setCluster(i);
Cluster cluster(i, points[index_point]);
clusters.push_back(cluster);
break;
}
}
}
int iter = 1;
printf(" Each point to nearest cluster\n");
while(true)
{
bool done = true;
// associates each point to the nearest center
for(int i = 0; i < total_points; i++)
{
int id_old_cluster = points[i].getCluster();
int id_nearest_center = getIDNearestCenter(points[i]);
if(id_old_cluster != id_nearest_center)
{
if(id_old_cluster != -1)
clusters[id_old_cluster].removePoint(points[i].getID());
points[i].setCluster(id_nearest_center);
clusters[id_nearest_center].addPoint(points[i]);
done = false;
}
}
// recalculating the center of each cluster
for(int i = 0; i < K; i++)
{
for(int j = 0; j < total_values; j++)
{
int total_points_cluster = clusters[i].getTotalPoints();
double sum = 0.0;
if(total_points_cluster > 0)
{
for(int p = 0; p < total_points_cluster; p++)
sum += clusters[i].getPoint(p).getValue(j);
clusters[i].setCentralValue(j, sum / total_points_cluster);
}
}
}
if(done == true || iter >= max_iterations)
{
cout << "Break in iteration " << iter << "\n\n";
break;
}
iter++;
}
// shows elements of clusters
for(int i = 0; i < K; i++)
{
int total_points_cluster = clusters[i].getTotalPoints();
cout << "Cluster " << clusters[i].getID() + 1 << endl;
for(int j = 0; j < total_points_cluster; j++)
{
cout << "Point " << clusters[i].getPoint(j).getID() + 1 << ": ";
for(int p = 0; p < total_values; p++)
cout << clusters[i].getPoint(j).getValue(p) << " ";
string point_name = clusters[i].getPoint(j).getName();
if(point_name != "")
cout << "- " << point_name;
cout << endl;
}
cout << "Cluster values: ";
for(int j = 0; j < total_values; j++)
cout << clusters[i].getCentralValue(j) << " ";
cout << "\n\n";
}
}
};
int main(int argc, char *argv[])
{
srand(time(NULL));
int total_points, total_values, K, max_iterations, has_name;
ifstream inFile("datafile.txt");
if (!inFile) {
cerr << "Unable to open file datafile.txt";
exit(1); // call system to stop
}
inFile >> total_points >> total_values >> K >> max_iterations >> has_name;
cout << "Details- \n";
vector<Point> points;
string point_name,str;
int i=0;
while(inFile.eof())
{
string temp;
vector<double> values;
for(int j = 0; j < total_values; j++)
{
double value;
inFile >> value;
values.push_back(value);
}
if(has_name)
{
inFile >> point_name;
Point p(i, values, point_name);
points.push_back(p);
i++;
}
else
{
inFile >> temp;
Point p(i, values);
points.push_back(p);
i++;
}
}
inFile.close();
KMeans kmeans(K, total_points, total_values, max_iterations);
kmeans.run(points);
return 0;
}
Output of code is -
Details-
15043100000Inside run
K distinct cluster i= 0
Segmentation fault
When I run it in gdb, the error shown is -
Program received signal SIGSEGV, Segmentation fault.
0x0000000000401db6 in Point::setCluster (this=0x540, id_cluster=0)
at kmeans.cpp:41
41 this->id_cluster = id_cluster;
I am stuck at this as I cannot find the cause for this segmentation fault.
My dataset file looks like -
150 4 3 10000 1
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
. . .
7.0,3.2,4.7,1.4,Iris-versicolor
6.4,3.2,4.5,1.5,Iris-versicolor
6.9,3.1,4.9,1.5,Iris-versicolor
5.5,2.3,4.0,1.3,Iris-versicolor
6.5,2.8,4.6,1.5,Iris-versicolor
. . .
Upvotes: 1
Views: 190
Reputation: 3875
in KMeans::run(vector<Point>&)
you call points[index_point].setCluster(i);
without any guarantee that index_point
is within bounds.
index_point
is determined by int index_point = rand() % total_points;
, and total_points
is retrieved from the input file "datafile.txt" which could be anything. It certainly does not have to match points.size()
, but it should. Make sure it does, or just use points.size()
instead.
A bit offtopic, but using rand()
and only using modulo is almost always wrong. If you use C++11 or newer, please consider using std::uniform_int_distribution.
Upvotes: 1
Reputation: 11934
points[index_point].setCluster(i);
could be accessing the vector out of bounds. The code you quoted actually always sets a number of total_points
in the vector points
before calling run
, while your modified code just reads until end of file and has no guarantees that the number of total points passed to the constructor of KMeans
matches the value of entries in points
. Either fix your file I/O or fix the logic of bounds checking.
Upvotes: 0