Skip to content

Commit

Permalink
Merge pull request #581 from karlnapf/master
Browse files Browse the repository at this point in the history
bugfixes and new tests for quadratic mmd
  • Loading branch information
karlnapf committed Jun 12, 2012
2 parents df58c63 + 6585589 commit 86919c0
Show file tree
Hide file tree
Showing 7 changed files with 187 additions and 49 deletions.
1 change: 1 addition & 0 deletions examples/undocumented/libshogun/Makefile
Expand Up @@ -80,6 +80,7 @@ TARGETS = basic_minimal \
library_cover_tree \
kernel_machine_train_locked \
statistics \
statistics_quadratic_time_mmd \

all: $(TARGETS)

Expand Down
144 changes: 144 additions & 0 deletions examples/undocumented/libshogun/statistics_quadratic_time_mmd.cpp
@@ -0,0 +1,144 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2012 Heiko Strathmann
*/

#include <shogun/base/init.h>
#include <shogun/statistics/QuadraticTimeMMD.h>
#include <shogun/kernel/GaussianKernel.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/mathematics/Statistics.h>

using namespace shogun;


void create_mean_data(SGMatrix<float64_t> target, float64_t difference)
{
/* create data matrix for P and Q. P is a standard normal, Q is the same but
* has a mean difference in one dimension */
for (index_t i=0; i<target.num_rows; ++i)
{
for (index_t j=0; j<target.num_cols/2; ++j)
target(i,j)=CMath::randn_double();

/* add mean difference in first dimension of second half of data */
for (index_t j=target.num_cols/2; j<target.num_cols; ++j)
target(i,j)=CMath::randn_double() + (i==0 ? difference : 0);
}
}

/** tests the quadratic mmd statistic for a single data case and ensures
* equality with matlab implementation */
void test_quadratic_mmd_fixed()
{
index_t n=2;
index_t d=3;
float64_t sigma=2;
float64_t sq_sigma_twice=sigma*sigma*2;
SGMatrix<float64_t> data(d,2*n);
for (index_t i=0; i<2*d*n; ++i)
data.matrix[i]=i;

CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
CGaussianKernel* kernel=new CGaussianKernel(10, sq_sigma_twice);
kernel->init(features, features);

CQuadraticTimeMMD* mmd=new CQuadraticTimeMMD(kernel, features, n);

float64_t difference=CMath::abs(mmd->compute_statistic()-0.051325806508381);
ASSERT(difference<=10E-16);

SG_UNREF(mmd);
}

void test_quadratic_mmd_bootstrap()
{
index_t dimension=3;
index_t m=300;
float64_t difference=0.5;
float64_t sigma=2;

SGMatrix<float64_t> data(dimension, 2*m);
create_mean_data(data, difference);
CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);

/* shoguns kernel width is different */
CGaussianKernel* kernel=new CGaussianKernel(100, sigma*sigma*2);
CQuadraticTimeMMD* mmd=new CQuadraticTimeMMD(kernel, features, m);
mmd->set_bootstrap_iterations(100);
SGVector<float64_t> null_samples=mmd->bootstrap_null();

null_samples.display_vector();
SG_SPRINT("mean %f, var %f\n", CStatistics::mean(null_samples),
CStatistics::variance(null_samples));

SG_UNREF(mmd);
}

void test_spectrum(CQuadraticTimeMMD* mmd)
{
mmd->set_num_samples_sepctrum(250);
mmd->set_num_eigenvalues_spectrum(2);
mmd->set_p_value_method(MMD2_SPECTRUM);
SG_SPRINT("spectrum: %f\n", mmd->compute_p_value(2));
}

void test_gamma(CQuadraticTimeMMD* mmd)
{
mmd->set_p_value_method(MMD2_GAMMA);
SG_SPRINT("gamma: %f\n", mmd->compute_p_value(2));
}

/** tests the quadratic mmd statistic for a random data case (fixed distribution
* and ensures equality with matlab implementation */
void test_quadratic_mmd_random()
{
index_t dimension=3;
index_t m=300;
float64_t difference=0.5;
float64_t sigma=2;

index_t num_runs=100;
SGVector<float64_t> mmds(num_runs);

SGMatrix<float64_t> data(dimension, 2*m);

CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);

/* shoguns kernel width is different */
CGaussianKernel* kernel=new CGaussianKernel(100, sigma*sigma*2);
CQuadraticTimeMMD* mmd=new CQuadraticTimeMMD(kernel, features, m);
for (index_t i=0; i<num_runs; ++i)
{
create_mean_data(data, difference);
kernel->init(features, features);
mmds[i]=mmd->compute_statistic();
}

/* MATLAB 95% mean confidence interval 0.007495841715582 0.037960088792417 */
float64_t mean=CStatistics::mean(mmds);
ASSERT((mean>0.007495841715582) && (mean<0.037960088792417));

/* MATLAB variance is 5.800439687240292e-05 quite stable */
float64_t variance=CStatistics::variance(mmds);
ASSERT(CMath::abs(variance-5.800439687240292e-05)<10E-5);
SG_UNREF(mmd);
}

int main(int argc, char** argv)
{
init_shogun_with_defaults();

test_quadratic_mmd_fixed();
test_quadratic_mmd_random();
// test_quadratic_mmd_bootstrap();

exit_shogun();
return 0;
}


1 change: 0 additions & 1 deletion src/shogun/features/SubsetStack.cpp
Expand Up @@ -81,7 +81,6 @@ void CSubsetStack::add_subset(SGVector<index_t> subset)

/* get latest current subset */
CSubset* latest=(CSubset*)m_active_subsets_stack->get_last_element();
SGVector<index_t>::display_vector(latest->m_subset_idx.vector, latest->m_subset_idx.vlen, "latest");

/* create new index vector */
SGVector<index_t> new_active_subset=SGVector<index_t>(subset.vlen);
Expand Down
20 changes: 10 additions & 10 deletions src/shogun/mathematics/Statistics.h
Expand Up @@ -75,7 +75,7 @@ class CStatistics: public CSGObject
* Given probability p, finds the argument t such that stdtr(k,t)
* is equal to p.
*
* Taken from ALGOLIB under gpl2+
* Taken from ALGLIB under gpl2+
*/
static float64_t inverse_student_t(int32_t k, float64_t p);

Expand All @@ -88,7 +88,7 @@ class CStatistics: public CSGObject
* The routine performs interval halving or Newton iterations to find the
* root of incbet(a,b,x) - y = 0.
*
* Taken from ALGOLIB under gpl2+
* Taken from ALGLIB under gpl2+
*/
static float64_t inverse_incomplete_beta(float64_t a, float64_t b,
float64_t y);
Expand All @@ -115,7 +115,7 @@ class CStatistics: public CSGObject
* The integral is evaluated by a continued fraction expansion
* or, when b*x is small, by a power series.
*
* Taken from ALGOLIB under gpl2+
* Taken from ALGLIB under gpl2+
*/
static float64_t incomplete_beta(float64_t a, float64_t b, float64_t x);

Expand All @@ -133,7 +133,7 @@ class CStatistics: public CSGObject
* and the other for y up to exp(-2). For larger arguments,
* w = y - 0.5, and x/sqrt(2pi) = w + w**3 R(w**2)/S(w**2)).
*
* Taken from ALGOLIB under gpl2+
* Taken from ALGLIB under gpl2+
*/
static float64_t inverse_normal_distribution(float64_t y0);

Expand Down Expand Up @@ -178,7 +178,7 @@ class CStatistics: public CSGObject
* continued fraction expansion, depending on the relative
* values of a and x.
*
* Taken from ALGOLIB under gpl2+
* Taken from ALGLIB under gpl2+
*/
static float64_t incomplete_gamma(float64_t a, float64_t x);

Expand All @@ -203,12 +203,12 @@ class CStatistics: public CSGObject
* continued fraction expansion, depending on the relative
* values of a and x.
*
* Taken from ALGOLIB under gpl2+
* Taken from ALGLIB under gpl2+
*/
static float64_t incomplete_gamma_completed(float64_t a, float64_t x);

/** Evaluates the CDF of the gamma distribution with given parameters a, b
* at x. Based on Wikipedia definition and ALGOLIB routines.
* at x. Based on Wikipedia definition and ALGLIB routines.
*
* @param x position to evaluate
* @param a shape parameter
Expand Down Expand Up @@ -248,21 +248,21 @@ class CStatistics: public CSGObject
/** Power series for incomplete beta integral.
* Use when b*x is small and x not too close to 1.
*
* Taken from ALGOLIB under gpl2+
* Taken from ALGLIB under gpl2+
*/
static float64_t ibetaf_incompletebetaps(float64_t a, float64_t b,
float64_t x, float64_t maxgam);

/** Continued fraction expansion #1 for incomplete beta integral
*
* Taken from ALGOLIB under gpl2+
* Taken from ALGLIB under gpl2+
*/
static float64_t ibetaf_incompletebetafe(float64_t a, float64_t b,
float64_t x, float64_t big, float64_t biginv);

/** Continued fraction expansion #2 for incomplete beta integral
*
* Taken from ALGOLIB under gpl2+
* Taken from ALGLIB under gpl2+
*/
static float64_t ibetaf_incompletebetafe2(float64_t a, float64_t b,
float64_t x, float64_t big, float64_t biginv);
Expand Down
58 changes: 24 additions & 34 deletions src/shogun/statistics/QuadraticTimeMMD.cpp
Expand Up @@ -47,7 +47,7 @@ float64_t CQuadraticTimeMMD::compute_statistic()
{
/* split computations into three terms from JLMR paper (see documentation )*/
index_t m=m_q_start;
index_t n=m_p_and_q->get_num_vectors();
index_t n=m_p_and_q->get_num_vectors()-m_q_start;

/* init kernel with features */
m_kernel->init(m_p_and_q, m_p_and_q);
Expand All @@ -56,40 +56,30 @@ float64_t CQuadraticTimeMMD::compute_statistic()
float64_t first=0;
for (index_t i=0; i<m; ++i)
{
/* ensure i!=j while adding up */
for (index_t j=0; j<m; ++j)
{
/* ensure i!=j */
if (i==j)
continue;

first+=m_kernel->kernel(i,j);
}
first+=i==j ? 0 :m_kernel->kernel(i,j);
}
first/=m*(m-1);

/* second term */
float64_t second=0;
for (index_t i=m_q_start; i<n; ++i)
for (index_t i=m_q_start; i<m_q_start+n; ++i)
{
for (index_t j=m_q_start; j<n; ++j)
{
/* ensure i!=j */
if (i==j)
continue;

second+=m_kernel->kernel(i,j);
}
/* ensure i!=j while adding up */
for (index_t j=m_q_start; j<m_q_start+n; ++j)
second+=i==j ? 0 :m_kernel->kernel(i,j);
}
second/=n*(n-1);

/* third term */
float64_t third=0;
for (index_t i=0; i<m; ++i)
{
for (index_t j=m_q_start; j<n; ++j)
for (index_t j=m_q_start; j<m_q_start+n; ++j)
third+=m_kernel->kernel(i,j);
}
third*=-2.0/(m*n);
third*=2.0/(m*n);

return first+second-third;
}
Expand All @@ -100,18 +90,22 @@ float64_t CQuadraticTimeMMD::compute_p_value(float64_t statistic)

switch (m_p_value_method)
{
#ifdef HAVE_LAPACK
case MMD2_SPECTRUM:
{
#ifdef HAVE_LAPACK
/* get samples from null-distribution and compute p-value of statistic */
SGVector<float64_t> null_samples=sample_null_spectrum(
m_num_samples_spectrum, m_num_eigenvalues_spectrum);
CMath::qsort(null_samples);
float64_t pos=CMath::find_position_to_insert(null_samples, statistic);
result=1.0-pos/null_samples.vlen;
#else // HAVE_LAPACK
SG_ERROR("CQuadraticTimeMMD::compute_p_value(): Only possible if "
"shogun is compiled with LAPACK enabled\n");
#endif // HAVE_LAPACK
break;
}
#endif // HAVE_LAPACK

case MMD2_GAMMA:
result=compute_p_value_gamma(statistic);
break;
Expand Down Expand Up @@ -202,7 +196,6 @@ float64_t CQuadraticTimeMMD::compute_p_value_gamma(float64_t statistic)
* K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
* works since X and Y are concatenated here */
m_kernel->init(m_p_and_q, m_p_and_q);
SGMatrix<float64_t> K=m_kernel->get_kernel_matrix();

/* compute mean under H0 of MMD, which is
* meanMMD = 2/m * ( 1 - 1/m*sum(diag(KL)) );
Expand All @@ -212,13 +205,7 @@ float64_t CQuadraticTimeMMD::compute_p_value_gamma(float64_t statistic)
{
/* virtual KL matrix is in upper right corner of SHOGUN K matrix
* so this sums the diagonal of the matrix between X and Y*/
mean_mmd+=K(i, m_q_start+i);

/* remove diagonal of all pairs of kernel matrices on the fly */
K(i, i)=0;
K(m_q_start+i, m_q_start+i)=0;
K(i, m_q_start+i)=0;
K(m_q_start+i, i)=0;
mean_mmd+=m_kernel->kernel(i, m_q_start+i);
}
mean_mmd=2.0/m_q_start*(1.0-1.0/m_q_start*mean_mmd);

Expand All @@ -230,11 +217,14 @@ float64_t CQuadraticTimeMMD::compute_p_value_gamma(float64_t statistic)
{
for (index_t j=0; j<m_q_start; ++j)
{
float64_t to_add=0;
to_add+=K(i, j);
to_add+=K(m_q_start+i, m_q_start+j);
to_add-=K(i, m_q_start+j);
to_add-=K(m_q_start+i, j);
/* dont add diagonal of all pairs of imaginary kernel matrices */
if (i==j || m_q_start+i==j || m_q_start+j==i)
continue;

float64_t to_add=m_kernel->kernel(i, j);
to_add+=m_kernel->kernel(m_q_start+i, m_q_start+j);
to_add-=m_kernel->kernel(i, m_q_start+j);
to_add-=m_kernel->kernel(m_q_start+i, j);
var_mmd+=CMath::pow(to_add, 2);
}
}
Expand Down
4 changes: 4 additions & 0 deletions src/shogun/statistics/QuadraticTimeMMD.h
Expand Up @@ -39,6 +39,8 @@ class CQuadraticTimeMMD : public CKernelTwoSampleTestStatistic
* using the Eigen-spectrum of the centered kernel matrix of the merged
* samples of p and q. May be used to compute p_value (easy)
*
* kernel matrix needs to be stored in memory
*
* Works well if the kernel matrix is NOT diagonal dominant.
* See Gretton, A., Fukumizu, K., & Harchaoui, Z. (2011).
* A fast, consistent kernel two-sample test.
Expand All @@ -62,6 +64,8 @@ class CQuadraticTimeMMD : public CKernelTwoSampleTestStatistic
* Returns the p-value for a given statistic value in the
* null-distribution.
*
* Works for arbritarily large kernel matrices (is not precomputed)
*
* See Gretton, A., Fukumizu, K., & Harchaoui, Z. (2011).
* A fast, consistent kernel two-sample test.
*
Expand Down

0 comments on commit 86919c0

Please sign in to comment.