Skip to content

Commit

Permalink
Added normalized mutual information evaluation for clustering.
Browse files Browse the repository at this point in the history
  • Loading branch information
pluskid committed Apr 5, 2012
1 parent 1aa40a1 commit 57d869d
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 13 deletions.
13 changes: 13 additions & 0 deletions examples/undocumented/python_modular/evaluation_clustering.py
Expand Up @@ -5,6 +5,7 @@
from shogun.Features import RealFeatures
from shogun.Features import Labels
from shogun.Evaluation import ClusteringAccuracy
from shogun.Evaluation import ClusteringMutualInformation

def get_dataset():
from os.path import exists
Expand Down Expand Up @@ -62,6 +63,18 @@ def assign_labels(data, centroids):
AccuracyEval = ClusteringAccuracy()
AccuracyEval.best_map(gnd_hat, gnd)

with open('/tmp/foo.txt', 'w') as ous:
for i in range(gnd_hat.get_num_labels()):
ous.write('%d ' % gnd_hat.get_int_label(i))
ous.write('\n')
for i in range(gnd.get_num_labels()):
ous.write('%d ' % gnd.get_int_label(i))
ous.write('\n')

accuracy = AccuracyEval.evaluate(gnd_hat, gnd)
print('Clustering accuracy = %.4f' % accuracy)

MIEval = ClusteringMutualInformation()
mutual_info = MIEval.evaluate(gnd_hat, gnd)
print('Clustering mutual information = %.4f' % mutual_info)

2 changes: 2 additions & 0 deletions src/interfaces/modular/Evaluation.i
Expand Up @@ -13,6 +13,7 @@
%rename(BinaryClassEvaluation) CBinaryClassEvaluation;
%rename(ClusteringEvaluation) CClusteringEvaluation;
%rename(ClusteringAccuracy) CClusteringAccuracy;
%rename(ClusteringMutualInformation) CClusteringMutualInformation;
%rename(ContingencyTableEvaluation) CContingencyTableEvaluation;
%rename(MulticlassAccuracy) CMulticlassAccuracy;
%rename(MeanAbsoluteError) CMeanAbsoluteError;
Expand All @@ -39,6 +40,7 @@
%include <shogun/evaluation/BinaryClassEvaluation.h>
%include <shogun/evaluation/ClusteringEvaluation.h>
%include <shogun/evaluation/ClusteringAccuracy.h>
%include <shogun/evaluation/ClusteringMutualInformation.h>
%include <shogun/evaluation/ContingencyTableEvaluation.h>
%include <shogun/evaluation/MulticlassAccuracy.h>
%include <shogun/evaluation/MeanAbsoluteError.h>
Expand Down
1 change: 1 addition & 0 deletions src/interfaces/modular/Evaluation_includes.i
Expand Up @@ -4,6 +4,7 @@
#include <shogun/evaluation/BinaryClassEvaluation.h>
#include <shogun/evaluation/ClusteringEvaluation.h>
#include <shogun/evaluation/ClusteringAccuracy.h>
#include <shogun/evaluation/ClusteringMutualInformation.h>
#include <shogun/evaluation/ContingencyTableEvaluation.h>
#include <shogun/evaluation/MulticlassAccuracy.h>
#include <shogun/evaluation/MeanAbsoluteError.h>
Expand Down
30 changes: 17 additions & 13 deletions src/shogun/evaluation/ClusteringEvaluation.cpp
Expand Up @@ -19,40 +19,44 @@
using namespace shogun;
using namespace std;

static void unique_labels(CLabels* labels, vector<int32_t>& result)
vector<int32_t> CClusteringEvaluation::unique_labels(CLabels* labels)
{
set<int32_t> uniq_lbl;
std::set<int32_t> uniq_lbl;
for (int32_t i=labels->get_num_labels()-1; i >= 0; --i)
{
uniq_lbl.insert(labels->get_int_label(i));
}
result.assign(uniq_lbl.begin(), uniq_lbl.end());
return std::vector<int32_t>(uniq_lbl.begin(), uniq_lbl.end());
}

static int32_t find_mismatch_count(const SGVector<int32_t>& l1, int32_t m1, const SGVector<int32_t>& l2, int32_t m2)
int32_t CClusteringEvaluation::find_match_count(const SGVector<int32_t>& l1, int32_t m1, const SGVector<int32_t>& l2, int32_t m2)
{
int32_t mismatch_count=0;
int32_t match_count=0;
for (int32_t i=l1.vlen-1; i >= 0; --i)
{
if (l1[i] != m1 || l2[i] != m2)
mismatch_count++;
if (l1[i] == m1 && l2[i] == m2)
match_count++;
}

return mismatch_count;
return match_count;
}

int32_t CClusteringEvaluation::find_mismatch_count(const SGVector<int32_t>& l1, int32_t m1, const SGVector<int32_t>& l2, int32_t m2)
{
return l1.vlen - find_match_count(l1, m1, l2, m2);
}

void CClusteringEvaluation::best_map(CLabels* predicted, CLabels* ground_truth)
{
ASSERT(predicted->get_num_labels() == ground_truth->get_num_labels());
vector<int32_t> label_p, label_g;
unique_labels(predicted, label_p);
unique_labels(ground_truth, label_g);
std::vector<int32_t> label_p=unique_labels(predicted);
std::vector<int32_t> label_g=unique_labels(ground_truth);

SGVector<int32_t> predicted_ilabels=predicted->get_int_labels();
SGVector<int32_t> groundtruth_ilabels=ground_truth->get_int_labels();

int32_t n_class=max(label_p.size(), label_g.size());
SGMatrix<double> G(n_class, n_class);
SGMatrix<float64_t> G(n_class, n_class);
G.zero();

for (size_t i=0; i < label_g.size(); ++i)
Expand All @@ -67,7 +71,7 @@ void CClusteringEvaluation::best_map(CLabels* predicted, CLabels* ground_truth)
Munkres munkres_solver(G);
munkres_solver.solve();

map<int32_t, int32_t> label_map;
std::map<int32_t, int32_t> label_map;
for (size_t i=0; i < label_p.size(); ++i)
{
for (size_t j=0; j < label_g.size(); ++j)
Expand Down
23 changes: 23 additions & 0 deletions src/shogun/evaluation/ClusteringEvaluation.h
Expand Up @@ -43,6 +43,29 @@ class CClusteringEvaluation: public CEvaluation
* @return evaluation result
*/
virtual float64_t evaluate(CLabels* predicted, CLabels* ground_truth) = 0;
protected:
/** get a vector of unique labels occured
*
* @param labels labels to be investigated
* @return a vector of unique labels
*/
std::vector<int32_t> unique_labels(CLabels* labels);

/** find number of matches in the two labels sequence.
*
* For each index i, if l1[i] == m1 and l2[i] == m2, then we get a match.
* @param l1 the first label sequence to be matched
* @param m1 the first label to match
* @param l2 the second label sequence to be matched
* @param m2 the second label to match
* @return number of matches
*/
int32_t find_match_count(const SGVector<int32_t>& l1, int32_t m1, const SGVector<int32_t>& l2, int32_t m2);

/** find number of mismatches in the two labels sequence.
* @see find_match_count
*/
int32_t find_mismatch_count(const SGVector<int32_t>& l1, int32_t m1, const SGVector<int32_t>& l2, int32_t m2);
};

} // namespace shogun
Expand Down
110 changes: 110 additions & 0 deletions src/shogun/evaluation/ClusteringMutualInformation.h
@@ -0,0 +1,110 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2012 Chiyuan Zhang
* Copyright (C) 2012 Chiyuan Zhang
*/

#ifndef __CLUSTERINGMUTUALINFORMATION_H__
#define __CLUSTERINGMUTUALINFORMATION_H__

#include <vector>
#include <cmath>
#include <algorithm>

#include <shogun/evaluation/ClusteringEvaluation.h>

namespace shogun
{

/** @brief clustering (normalized) mutual information
*/
class CClusteringMutualInformation: public CClusteringEvaluation
{
public:
/** constructor */
CClusteringMutualInformation(): CClusteringEvaluation() {}

/** destructor */
virtual ~CClusteringMutualInformation() {}

/** evaluate labels
* Make sure to call CClusteringEvaluation::best_map to map the predicted label
* before calculating mutual information.
*
* @param predicted labels for evaluating
* @param ground_truth labels assumed to be correct
* @return evaluation result
*/
virtual float64_t evaluate(CLabels* predicted, CLabels* ground_truth)
{
std::vector<int32_t> label_p=unique_labels(predicted);
std::vector<int32_t> label_g=unique_labels(ground_truth);

if (label_p.size() != label_g.size())
SG_ERROR("Number of classes are different\n");
uint32_t n_class=label_p.size();
float64_t n_label=predicted->get_num_labels();

SGVector<int32_t> ilabels_p=predicted->get_int_labels();
SGVector<int32_t> ilabels_g=ground_truth->get_int_labels();

SGMatrix<float64_t> G(n_class, n_class);
for (size_t i=0; i < n_class; ++i)
for (size_t j=0; j < n_class; ++j)
G(i, j)=find_match_count(ilabels_g, label_g[i],
ilabels_p, label_p[j])/n_label;
std::vector<float64_t> G_rowsum(n_class), G_colsum(n_class);
for (size_t i=0; i < n_class; ++i)
{
for (size_t j=0; j < n_class; ++j)
{
G_rowsum[i] += G(i, j);
G_colsum[i] += G(j, i);
}
}

float64_t mutual_info = 0;
for (size_t i=0; i < n_class; ++i)
{
for (size_t j=0; j < n_class; ++j)
{
if (G(i, j) != 0)
mutual_info += G(i, j) * log(G(i,j) /
(G_rowsum[i]*G_colsum[j]))/log(2.);
}
}

float64_t entropy_p = 0, entropy_g = 0;
for (size_t i=0; i < n_class; ++i)
{
entropy_g += -G_rowsum[i] * log(G_rowsum[i])/log(2.);
entropy_p += -G_colsum[i] * log(G_colsum[i])/log(2.);
}

return mutual_info / std::max(entropy_g, entropy_p);
}

/** @return whether criterium has to be maximized or minimized */
virtual EEvaluationDirection get_evaluation_direction()
{
return ED_MINIMIZE;
}

/** Returns the name of the SGSerializable instance. It MUST BE
* the CLASS NAME without the prefixed `C'.
*
* @return name of the SGSerializable
*/
virtual const char* get_name() const
{
return "ClusteringMutualInformation";
}
};

}

#endif /* end of include guard: __CLUSTERINGMUTUALINFORMATION_H__ */

0 comments on commit 57d869d

Please sign in to comment.