Merge pull request #405 from pluskid/gsoc-pullrequest-1

Accuracy Evaluation for Clustering
shogun-toolbox · Mar 31, 2012 · a1f473c · a1f473c
2 parents 17ea108 + 157f188
commit a1f473c
Show file tree

Hide file tree

Showing 9 changed files with 714 additions and 1 deletion.
diff --git a/examples/undocumented/python_modular/evaluation_clustering.py b/examples/undocumented/python_modular/evaluation_clustering.py
@@ -0,0 +1,67 @@
+##!/usr/bin/env python
+# Example on how to evaluate the clustering performance (given ground-truth)
+
+from shogun.Distance import EuclidianDistance
+from shogun.Features import RealFeatures
+from shogun.Features import Labels
+from shogun.Evaluation import ClusteringAccuracy
+
+def get_dataset():
+    from os.path import exists
+    from urllib2 import urlopen
+
+    filename = "../data/optdigits.tes"
+    if exists(filename):
+        return open(filename)
+    else:
+        print("Retrieving data...")
+        return urlopen("http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes")
+
+def prepare_data():
+    from numpy import loadtxt
+
+    stream = get_dataset()
+    print("Loading data...")
+    data = loadtxt(stream, delimiter=',')
+    fea = data[:, :-1]
+    gnd = data[:, -1]
+    return (fea.T, gnd)
+
+def run_clustering(data, k):
+    from shogun.Clustering import KMeans
+    from shogun.Mathematics import Math_init_random
+
+    Math_init_random(42)
+    fea = RealFeatures(data)
+    distance = EuclidianDistance(fea, fea)
+    kmeans=KMeans(k, distance)
+
+    print("Running clustering...")
+    kmeans.train()
+
+    return kmeans.get_cluster_centers()
+
+def assign_labels(data, centroids):
+    from shogun.Classifier import KNN
+    from numpy import arange
+
+    labels = Labels(arange(1.,11.))
+    fea = RealFeatures(data)
+    fea_centroids = RealFeatures(centroids)
+    distance = EuclidianDistance(fea_centroids, fea_centroids)
+    knn = KNN(1, distance, labels)
+    knn.train()
+    return knn.apply(fea)
+
+if __name__ == '__main__':
+    (fea, gnd_raw) = prepare_data()
+    centroids = run_clustering(fea, 10)
+    gnd_hat = assign_labels(fea, centroids)
+    gnd = Labels(gnd_raw)
+
+    AccuracyEval = ClusteringAccuracy()
+    AccuracyEval.best_map(gnd_hat, gnd)
+
+    accuracy = AccuracyEval.evaluate(gnd_hat, gnd)
+    print('Clustering accuracy = %.4f' % accuracy)
+
diff --git a/src/interfaces/modular/Evaluation.i b/src/interfaces/modular/Evaluation.i
@@ -11,6 +11,8 @@
 /* Remove C Prefix */
 %rename(Evaluation) CEvaluation;
 %rename(BinaryClassEvaluation) CBinaryClassEvaluation;
+%rename(ClusteringEvaluation) CClusteringEvaluation;
+%rename(ClusteringAccuracy) CClusteringAccuracy;
 %rename(ContingencyTableEvaluation) CContingencyTableEvaluation;
 %rename(MulticlassAccuracy) CMulticlassAccuracy;
 %rename(MeanAbsoluteError) CMeanAbsoluteError;
@@ -35,6 +37,8 @@
 /* Include Class Headers to make them visible from within the target language */
 %include <shogun/evaluation/Evaluation.h>
 %include <shogun/evaluation/BinaryClassEvaluation.h>
+%include <shogun/evaluation/ClusteringEvaluation.h>
+%include <shogun/evaluation/ClusteringAccuracy.h>
 %include <shogun/evaluation/ContingencyTableEvaluation.h>
 %include <shogun/evaluation/MulticlassAccuracy.h>
 %include <shogun/evaluation/MeanAbsoluteError.h>

diff --git a/src/interfaces/modular/Evaluation_includes.i b/src/interfaces/modular/Evaluation_includes.i
@@ -2,11 +2,13 @@
  #include <shogun/features/Labels.h>
  #include <shogun/evaluation/Evaluation.h>
  #include <shogun/evaluation/BinaryClassEvaluation.h>
+ #include <shogun/evaluation/ClusteringEvaluation.h>
+ #include <shogun/evaluation/ClusteringAccuracy.h>
  #include <shogun/evaluation/ContingencyTableEvaluation.h>
  #include <shogun/evaluation/MulticlassAccuracy.h>
  #include <shogun/evaluation/MeanAbsoluteError.h>
  #include <shogun/evaluation/MeanSquaredError.h>
-  #include <shogun/evaluation/MeanSquaredLogError.h>
+ #include <shogun/evaluation/MeanSquaredLogError.h>
  #include <shogun/evaluation/ROCEvaluation.h>
  #include <shogun/evaluation/PRCEvaluation.h>
  #include <shogun/evaluation/CrossValidation.h>

diff --git a/src/shogun/evaluation/ClusteringAccuracy.h b/src/shogun/evaluation/ClusteringAccuracy.h
@@ -0,0 +1,70 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Written (W) 2012 Chiyuan Zhang
+ * Copyright (C) 2012 Chiyuan Zhang
+ */
+
+#ifndef __CLUSTERINGACCURACY_H__
+#define __CLUSTERINGACCURACY_H__
+
+#include <shogun/evaluation/ClusteringEvaluation.h>
+
+namespace shogun
+{
+
+
+class CClusteringAccuracy: public CClusteringEvaluation
+{
+public:
+	/** constructor */
+	CClusteringAccuracy(): CClusteringEvaluation() {}
+
+	/** destructor */
+	virtual ~CClusteringAccuracy() {}
+
+	/** evaluate labels
+	 * Make sure to call CClusteringEvaluation::best_map to map the predicted label
+	 * before calculating accuracy.
+	 *
+	 * @param predicted labels for evaluating
+	 * @param ground_truth labels assumed to be correct
+	 * @return evaluation result
+	 */
+	virtual float64_t evaluate(CLabels* predicted, CLabels* ground_truth)
+	{
+		SGVector<int32_t> predicted_ilabels=predicted->get_int_labels();
+		SGVector<int32_t> groundtruth_ilabels=ground_truth->get_int_labels();
+		int32_t correct=0;
+		for (int32_t i=0; i < predicted_ilabels.vlen; ++i)
+		{
+			if (predicted_ilabels[i] == groundtruth_ilabels[i])
+				correct++;
+		}
+		return float64_t(correct)/predicted_ilabels.vlen;
+	}
+
+	/** @return whether criterium has to be maximized or minimized */
+	virtual EEvaluationDirection get_evaluation_direction()
+	{
+		return ED_MINIMIZE;
+	}
+
+	/** Returns the name of the SGSerializable instance.  It MUST BE
+	 *  the CLASS NAME without the prefixed `C'.
+	 *
+	 *  @return name of the SGSerializable
+	 */
+	virtual const char* get_name() const
+	{
+		return "ClusteringAccuracy";
+	}
+};
+
+} // namespace shogun
+
+#endif /* end of include guard: __CLUSTERINGACCURACY_H__ */
+
diff --git a/src/shogun/evaluation/ClusteringEvaluation.cpp b/src/shogun/evaluation/ClusteringEvaluation.cpp
@@ -0,0 +1,87 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Written (W) 2012 Chiyuan Zhang
+ * Copyright (C) 2012 Chiyuan Zhang
+ */
+
+#include <set>
+#include <map>
+#include <vector>
+#include <algorithm>
+
+#include <shogun/evaluation/ClusteringEvaluation.h>
+#include <shogun/mathematics/munkres.h>
+
+using namespace shogun;
+using namespace std;
+
+static void unique_labels(CLabels* labels, vector<int32_t>& result)
+{
+	set<int32_t> uniq_lbl;
+	for (int32_t i=labels->get_num_labels()-1; i >= 0; --i)
+	{
+		uniq_lbl.insert(labels->get_int_label(i));
+	}
+	result.assign(uniq_lbl.begin(), uniq_lbl.end());
+}
+
+static int32_t find_mismatch_count(const SGVector<int32_t>& l1, int32_t m1, const SGVector<int32_t>& l2, int32_t m2)
+{
+	int32_t mismatch_count=0;
+	for (int32_t i=l1.vlen-1; i >= 0; --i)
+	{
+		if (l1[i] != m1 || l2[i] != m2)
+			mismatch_count++;
+	}
+
+	return mismatch_count;
+}
+
+void CClusteringEvaluation::best_map(CLabels* predicted, CLabels* ground_truth)
+{
+	ASSERT(predicted->get_num_labels() == ground_truth->get_num_labels());
+	vector<int32_t> label_p, label_g;
+	unique_labels(predicted, label_p);
+	unique_labels(ground_truth, label_g);
+
+	SGVector<int32_t> predicted_ilabels=predicted->get_int_labels();
+	SGVector<int32_t> groundtruth_ilabels=ground_truth->get_int_labels();
+
+	int32_t n_class=max(label_p.size(), label_g.size());
+	SGMatrix<double> G(n_class, n_class);
+	G.zero();
+
+	for (size_t i=0; i < label_g.size(); ++i)
+	{
+		for (size_t j=0; j < label_p.size(); ++j)
+		{
+			G(i, j)=find_mismatch_count(groundtruth_ilabels, label_g[i],
+				predicted_ilabels, label_p[j]);
+		}
+	}
+
+	Munkres munkres_solver(G);
+	munkres_solver.solve();
+
+	map<int32_t, int32_t> label_map;
+	for (size_t i=0; i < label_p.size(); ++i)
+	{
+		for (size_t j=0; j < label_g.size(); ++j)
+		{
+			if (G(j, i) == 0)
+			{
+				label_map.insert(make_pair(label_p[i], label_g[j]));
+				break;
+			}
+		}
+	}
+
+	for (int32_t i= 0; i < predicted_ilabels.vlen; ++i)
+		predicted->set_int_label(i, label_map[predicted_ilabels[i]]);
+
+	G.destroy_matrix();
+}
diff --git a/src/shogun/evaluation/ClusteringEvaluation.h b/src/shogun/evaluation/ClusteringEvaluation.h
@@ -0,0 +1,50 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Written (W) 2012 Chiyuan Zhang
+ * Copyright (C) 2012 Chiyuan Zhang
+ */
+
+#ifndef __CLUSTERINGEVALUATION_H__
+#define __CLUSTERINGEVALUATION_H__
+
+#include <shogun/evaluation/Evaluation.h>
+#include <shogun/features/Labels.h>
+
+namespace shogun
+{
+
+/** @brief The base class used to evaluate clustering
+ */
+class CClusteringEvaluation: public CEvaluation
+{
+public:
+	/** constructor */
+	CClusteringEvaluation(): CEvaluation() {}
+
+	/** destructor */
+	virtual ~CClusteringEvaluation() {}
+
+	/** permute the order of the predicted labels to match the ground_truth as good as possible.
+	 *
+	 * The Munkres assignment algorithm is used to find the best match.
+	 * Note this method perform inplace modification on the parameter predicted
+	 * @param predicted labels for evaluating
+	 * @param ground_truth labels assumed to be correct
+	 */
+	void best_map(CLabels* predicted, CLabels* ground_truth);
+
+	/** evaluate labels
+	 * @param predicted labels for evaluating
+	 * @param ground_truth labels assumed to be correct
+	 * @return evaluation result
+	 */
+	virtual float64_t evaluate(CLabels* predicted, CLabels* ground_truth) = 0;
+};
+
+} // namespace shogun
+
+#endif /* end of include guard: __CLUSTERINGEVALUATION_H__ */
diff --git a/src/shogun/lib/DataType.h b/src/shogun/lib/DataType.h
@@ -365,6 +365,15 @@ template<class T> class SGMatrix
 			return &matrix[col*num_rows];
 		}
 
+        /** operator overload for matrix read only access
+         * @param i_row
+         * @param i_col
+         */
+        inline const T& operator()(index_t i_row, index_t i_col) const
+        {
+            return matrix[i_col*num_rows + i_row];
+        }
+
 		/** operator overload for matrix read only access
 		 * @param index to access
 		 */
@@ -373,6 +382,15 @@ template<class T> class SGMatrix
 			return matrix[index];
 		}
 
+        /** operator overload for matrix r/w access
+         * @param i_row
+         * @param i_col
+         */
+        inline T& operator()(index_t i_row, index_t i_col)
+        {
+            return matrix[i_col*num_rows + i_row];
+        }
+
 		/** operator overload for matrix r/w access
 		 * @param index to access
 		 */