Skip to content

Commit

Permalink
Merge pull request #405 from pluskid/gsoc-pullrequest-1
Browse files Browse the repository at this point in the history
Accuracy Evaluation for Clustering
  • Loading branch information
Soeren Sonnenburg committed Mar 31, 2012
2 parents 17ea108 + 157f188 commit a1f473c
Show file tree
Hide file tree
Showing 9 changed files with 714 additions and 1 deletion.
67 changes: 67 additions & 0 deletions examples/undocumented/python_modular/evaluation_clustering.py
@@ -0,0 +1,67 @@
##!/usr/bin/env python
# Example on how to evaluate the clustering performance (given ground-truth)

from shogun.Distance import EuclidianDistance
from shogun.Features import RealFeatures
from shogun.Features import Labels
from shogun.Evaluation import ClusteringAccuracy

def get_dataset():
from os.path import exists
from urllib2 import urlopen

filename = "../data/optdigits.tes"
if exists(filename):
return open(filename)
else:
print("Retrieving data...")
return urlopen("http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes")

def prepare_data():
from numpy import loadtxt

stream = get_dataset()
print("Loading data...")
data = loadtxt(stream, delimiter=',')
fea = data[:, :-1]
gnd = data[:, -1]
return (fea.T, gnd)

def run_clustering(data, k):
from shogun.Clustering import KMeans
from shogun.Mathematics import Math_init_random

Math_init_random(42)
fea = RealFeatures(data)
distance = EuclidianDistance(fea, fea)
kmeans=KMeans(k, distance)

print("Running clustering...")
kmeans.train()

return kmeans.get_cluster_centers()

def assign_labels(data, centroids):
from shogun.Classifier import KNN
from numpy import arange

labels = Labels(arange(1.,11.))
fea = RealFeatures(data)
fea_centroids = RealFeatures(centroids)
distance = EuclidianDistance(fea_centroids, fea_centroids)
knn = KNN(1, distance, labels)
knn.train()
return knn.apply(fea)

if __name__ == '__main__':
(fea, gnd_raw) = prepare_data()
centroids = run_clustering(fea, 10)
gnd_hat = assign_labels(fea, centroids)
gnd = Labels(gnd_raw)

AccuracyEval = ClusteringAccuracy()
AccuracyEval.best_map(gnd_hat, gnd)

accuracy = AccuracyEval.evaluate(gnd_hat, gnd)
print('Clustering accuracy = %.4f' % accuracy)

4 changes: 4 additions & 0 deletions src/interfaces/modular/Evaluation.i
Expand Up @@ -11,6 +11,8 @@
/* Remove C Prefix */
%rename(Evaluation) CEvaluation;
%rename(BinaryClassEvaluation) CBinaryClassEvaluation;
%rename(ClusteringEvaluation) CClusteringEvaluation;
%rename(ClusteringAccuracy) CClusteringAccuracy;
%rename(ContingencyTableEvaluation) CContingencyTableEvaluation;
%rename(MulticlassAccuracy) CMulticlassAccuracy;
%rename(MeanAbsoluteError) CMeanAbsoluteError;
Expand All @@ -35,6 +37,8 @@
/* Include Class Headers to make them visible from within the target language */
%include <shogun/evaluation/Evaluation.h>
%include <shogun/evaluation/BinaryClassEvaluation.h>
%include <shogun/evaluation/ClusteringEvaluation.h>
%include <shogun/evaluation/ClusteringAccuracy.h>
%include <shogun/evaluation/ContingencyTableEvaluation.h>
%include <shogun/evaluation/MulticlassAccuracy.h>
%include <shogun/evaluation/MeanAbsoluteError.h>
Expand Down
4 changes: 3 additions & 1 deletion src/interfaces/modular/Evaluation_includes.i
Expand Up @@ -2,11 +2,13 @@
#include <shogun/features/Labels.h>
#include <shogun/evaluation/Evaluation.h>
#include <shogun/evaluation/BinaryClassEvaluation.h>
#include <shogun/evaluation/ClusteringEvaluation.h>
#include <shogun/evaluation/ClusteringAccuracy.h>
#include <shogun/evaluation/ContingencyTableEvaluation.h>
#include <shogun/evaluation/MulticlassAccuracy.h>
#include <shogun/evaluation/MeanAbsoluteError.h>
#include <shogun/evaluation/MeanSquaredError.h>
#include <shogun/evaluation/MeanSquaredLogError.h>
#include <shogun/evaluation/MeanSquaredLogError.h>
#include <shogun/evaluation/ROCEvaluation.h>
#include <shogun/evaluation/PRCEvaluation.h>
#include <shogun/evaluation/CrossValidation.h>
Expand Down
70 changes: 70 additions & 0 deletions src/shogun/evaluation/ClusteringAccuracy.h
@@ -0,0 +1,70 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2012 Chiyuan Zhang
* Copyright (C) 2012 Chiyuan Zhang
*/

#ifndef __CLUSTERINGACCURACY_H__
#define __CLUSTERINGACCURACY_H__

#include <shogun/evaluation/ClusteringEvaluation.h>

namespace shogun
{


class CClusteringAccuracy: public CClusteringEvaluation
{
public:
/** constructor */
CClusteringAccuracy(): CClusteringEvaluation() {}

/** destructor */
virtual ~CClusteringAccuracy() {}

/** evaluate labels
* Make sure to call CClusteringEvaluation::best_map to map the predicted label
* before calculating accuracy.
*
* @param predicted labels for evaluating
* @param ground_truth labels assumed to be correct
* @return evaluation result
*/
virtual float64_t evaluate(CLabels* predicted, CLabels* ground_truth)
{
SGVector<int32_t> predicted_ilabels=predicted->get_int_labels();
SGVector<int32_t> groundtruth_ilabels=ground_truth->get_int_labels();
int32_t correct=0;
for (int32_t i=0; i < predicted_ilabels.vlen; ++i)
{
if (predicted_ilabels[i] == groundtruth_ilabels[i])
correct++;
}
return float64_t(correct)/predicted_ilabels.vlen;
}

/** @return whether criterium has to be maximized or minimized */
virtual EEvaluationDirection get_evaluation_direction()
{
return ED_MINIMIZE;
}

/** Returns the name of the SGSerializable instance. It MUST BE
* the CLASS NAME without the prefixed `C'.
*
* @return name of the SGSerializable
*/
virtual const char* get_name() const
{
return "ClusteringAccuracy";
}
};

} // namespace shogun

#endif /* end of include guard: __CLUSTERINGACCURACY_H__ */

87 changes: 87 additions & 0 deletions src/shogun/evaluation/ClusteringEvaluation.cpp
@@ -0,0 +1,87 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2012 Chiyuan Zhang
* Copyright (C) 2012 Chiyuan Zhang
*/

#include <set>
#include <map>
#include <vector>
#include <algorithm>

#include <shogun/evaluation/ClusteringEvaluation.h>
#include <shogun/mathematics/munkres.h>

using namespace shogun;
using namespace std;

static void unique_labels(CLabels* labels, vector<int32_t>& result)
{
set<int32_t> uniq_lbl;
for (int32_t i=labels->get_num_labels()-1; i >= 0; --i)
{
uniq_lbl.insert(labels->get_int_label(i));
}
result.assign(uniq_lbl.begin(), uniq_lbl.end());
}

static int32_t find_mismatch_count(const SGVector<int32_t>& l1, int32_t m1, const SGVector<int32_t>& l2, int32_t m2)
{
int32_t mismatch_count=0;
for (int32_t i=l1.vlen-1; i >= 0; --i)
{
if (l1[i] != m1 || l2[i] != m2)
mismatch_count++;
}

return mismatch_count;
}

void CClusteringEvaluation::best_map(CLabels* predicted, CLabels* ground_truth)
{
ASSERT(predicted->get_num_labels() == ground_truth->get_num_labels());
vector<int32_t> label_p, label_g;
unique_labels(predicted, label_p);
unique_labels(ground_truth, label_g);

SGVector<int32_t> predicted_ilabels=predicted->get_int_labels();
SGVector<int32_t> groundtruth_ilabels=ground_truth->get_int_labels();

int32_t n_class=max(label_p.size(), label_g.size());
SGMatrix<double> G(n_class, n_class);
G.zero();

for (size_t i=0; i < label_g.size(); ++i)
{
for (size_t j=0; j < label_p.size(); ++j)
{
G(i, j)=find_mismatch_count(groundtruth_ilabels, label_g[i],
predicted_ilabels, label_p[j]);
}
}

Munkres munkres_solver(G);
munkres_solver.solve();

map<int32_t, int32_t> label_map;
for (size_t i=0; i < label_p.size(); ++i)
{
for (size_t j=0; j < label_g.size(); ++j)
{
if (G(j, i) == 0)
{
label_map.insert(make_pair(label_p[i], label_g[j]));
break;
}
}
}

for (int32_t i= 0; i < predicted_ilabels.vlen; ++i)
predicted->set_int_label(i, label_map[predicted_ilabels[i]]);

G.destroy_matrix();
}
50 changes: 50 additions & 0 deletions src/shogun/evaluation/ClusteringEvaluation.h
@@ -0,0 +1,50 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2012 Chiyuan Zhang
* Copyright (C) 2012 Chiyuan Zhang
*/

#ifndef __CLUSTERINGEVALUATION_H__
#define __CLUSTERINGEVALUATION_H__

#include <shogun/evaluation/Evaluation.h>
#include <shogun/features/Labels.h>

namespace shogun
{

/** @brief The base class used to evaluate clustering
*/
class CClusteringEvaluation: public CEvaluation
{
public:
/** constructor */
CClusteringEvaluation(): CEvaluation() {}

/** destructor */
virtual ~CClusteringEvaluation() {}

/** permute the order of the predicted labels to match the ground_truth as good as possible.
*
* The Munkres assignment algorithm is used to find the best match.
* Note this method perform inplace modification on the parameter predicted
* @param predicted labels for evaluating
* @param ground_truth labels assumed to be correct
*/
void best_map(CLabels* predicted, CLabels* ground_truth);

/** evaluate labels
* @param predicted labels for evaluating
* @param ground_truth labels assumed to be correct
* @return evaluation result
*/
virtual float64_t evaluate(CLabels* predicted, CLabels* ground_truth) = 0;
};

} // namespace shogun

#endif /* end of include guard: __CLUSTERINGEVALUATION_H__ */
18 changes: 18 additions & 0 deletions src/shogun/lib/DataType.h
Expand Up @@ -365,6 +365,15 @@ template<class T> class SGMatrix
return &matrix[col*num_rows];
}

/** operator overload for matrix read only access
* @param i_row
* @param i_col
*/
inline const T& operator()(index_t i_row, index_t i_col) const
{
return matrix[i_col*num_rows + i_row];
}

/** operator overload for matrix read only access
* @param index to access
*/
Expand All @@ -373,6 +382,15 @@ template<class T> class SGMatrix
return matrix[index];
}

/** operator overload for matrix r/w access
* @param i_row
* @param i_col
*/
inline T& operator()(index_t i_row, index_t i_col)
{
return matrix[i_col*num_rows + i_row];
}

/** operator overload for matrix r/w access
* @param index to access
*/
Expand Down

0 comments on commit a1f473c

Please sign in to comment.