Merge pull request #581 from karlnapf/master

bugfixes and new tests for quadratic mmd
shogun-toolbox · Jun 12, 2012 · 86919c0 · 86919c0
2 parents df58c63 + 6585589
commit 86919c0
Show file tree

Hide file tree

Showing 7 changed files with 187 additions and 49 deletions.
diff --git a/examples/undocumented/libshogun/Makefile b/examples/undocumented/libshogun/Makefile
@@ -80,6 +80,7 @@ TARGETS = basic_minimal \
 		  library_cover_tree \
 		  kernel_machine_train_locked \
 		  statistics \
+		  statistics_quadratic_time_mmd \
 
 all: $(TARGETS)
 

diff --git a/examples/undocumented/libshogun/statistics_quadratic_time_mmd.cpp b/examples/undocumented/libshogun/statistics_quadratic_time_mmd.cpp
@@ -0,0 +1,144 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Written (W) 2012 Heiko Strathmann
+ */
+
+#include <shogun/base/init.h>
+#include <shogun/statistics/QuadraticTimeMMD.h>
+#include <shogun/kernel/GaussianKernel.h>
+#include <shogun/features/DenseFeatures.h>
+#include <shogun/mathematics/Statistics.h>
+
+using namespace shogun;
+
+
+void create_mean_data(SGMatrix<float64_t> target, float64_t difference)
+{
+	/* create data matrix for P and Q. P is a standard normal, Q is the same but
+	 * has a mean difference in one dimension */
+	for (index_t i=0; i<target.num_rows; ++i)
+	{
+		for (index_t j=0; j<target.num_cols/2; ++j)
+			target(i,j)=CMath::randn_double();
+
+		/* add mean difference in first dimension of second half of data */
+		for (index_t j=target.num_cols/2; j<target.num_cols; ++j)
+				target(i,j)=CMath::randn_double() + (i==0 ? difference : 0);
+	}
+}
+
+/** tests the quadratic mmd statistic for a single data case and ensures
+ * equality with matlab implementation */
+void test_quadratic_mmd_fixed()
+{
+	index_t n=2;
+	index_t d=3;
+	float64_t sigma=2;
+	float64_t sq_sigma_twice=sigma*sigma*2;
+	SGMatrix<float64_t> data(d,2*n);
+	for (index_t i=0; i<2*d*n; ++i)
+		data.matrix[i]=i;
+
+	CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
+	CGaussianKernel* kernel=new CGaussianKernel(10, sq_sigma_twice);
+	kernel->init(features, features);
+
+	CQuadraticTimeMMD* mmd=new CQuadraticTimeMMD(kernel, features, n);
+
+	float64_t difference=CMath::abs(mmd->compute_statistic()-0.051325806508381);
+	ASSERT(difference<=10E-16);
+
+	SG_UNREF(mmd);
+}
+
+void test_quadratic_mmd_bootstrap()
+{
+	index_t dimension=3;
+	index_t m=300;
+	float64_t difference=0.5;
+	float64_t sigma=2;
+
+	SGMatrix<float64_t> data(dimension, 2*m);
+	create_mean_data(data, difference);
+	CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
+
+	/* shoguns kernel width is different */
+	CGaussianKernel* kernel=new CGaussianKernel(100, sigma*sigma*2);
+	CQuadraticTimeMMD* mmd=new CQuadraticTimeMMD(kernel, features, m);
+	mmd->set_bootstrap_iterations(100);
+	SGVector<float64_t> null_samples=mmd->bootstrap_null();
+
+	null_samples.display_vector();
+	SG_SPRINT("mean %f, var %f\n", CStatistics::mean(null_samples),
+			CStatistics::variance(null_samples));
+
+	SG_UNREF(mmd);
+}
+
+void test_spectrum(CQuadraticTimeMMD* mmd)
+{
+	mmd->set_num_samples_sepctrum(250);
+	mmd->set_num_eigenvalues_spectrum(2);
+	mmd->set_p_value_method(MMD2_SPECTRUM);
+	SG_SPRINT("spectrum: %f\n", mmd->compute_p_value(2));
+}
+
+void test_gamma(CQuadraticTimeMMD* mmd)
+{
+	mmd->set_p_value_method(MMD2_GAMMA);
+	SG_SPRINT("gamma: %f\n", mmd->compute_p_value(2));
+}
+
+/** tests the quadratic mmd statistic for a random data case (fixed distribution
+ * and ensures equality with matlab implementation */
+void test_quadratic_mmd_random()
+{
+	index_t dimension=3;
+	index_t m=300;
+	float64_t difference=0.5;
+	float64_t sigma=2;
+
+	index_t num_runs=100;
+	SGVector<float64_t> mmds(num_runs);
+
+	SGMatrix<float64_t> data(dimension, 2*m);
+
+	CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
+
+	/* shoguns kernel width is different */
+	CGaussianKernel* kernel=new CGaussianKernel(100, sigma*sigma*2);
+	CQuadraticTimeMMD* mmd=new CQuadraticTimeMMD(kernel, features, m);
+	for (index_t i=0; i<num_runs; ++i)
+	{
+		create_mean_data(data, difference);
+		kernel->init(features, features);
+		mmds[i]=mmd->compute_statistic();
+	}
+
+	/* MATLAB 95% mean confidence interval 0.007495841715582 0.037960088792417 */
+	float64_t mean=CStatistics::mean(mmds);
+	ASSERT((mean>0.007495841715582) && (mean<0.037960088792417));
+
+	/* MATLAB variance is 5.800439687240292e-05 quite stable */
+	float64_t variance=CStatistics::variance(mmds);
+	ASSERT(CMath::abs(variance-5.800439687240292e-05)<10E-5);
+	SG_UNREF(mmd);
+}
+
+int main(int argc, char** argv)
+{
+	init_shogun_with_defaults();
+
+	test_quadratic_mmd_fixed();
+	test_quadratic_mmd_random();
+//	test_quadratic_mmd_bootstrap();
+
+	exit_shogun();
+	return 0;
+}
+
+
diff --git a/src/shogun/features/SubsetStack.cpp b/src/shogun/features/SubsetStack.cpp
@@ -81,7 +81,6 @@ void CSubsetStack::add_subset(SGVector<index_t> subset)
 
 		/* get latest current subset */
 		CSubset* latest=(CSubset*)m_active_subsets_stack->get_last_element();
-		SGVector<index_t>::display_vector(latest->m_subset_idx.vector, latest->m_subset_idx.vlen, "latest");
 
 		/* create new index vector */
 		SGVector<index_t> new_active_subset=SGVector<index_t>(subset.vlen);

diff --git a/src/shogun/mathematics/Statistics.h b/src/shogun/mathematics/Statistics.h
@@ -75,7 +75,7 @@ class CStatistics: public CSGObject
 	 * Given probability p, finds the argument t such that stdtr(k,t)
 	 * is equal to p.
 	 *
-	 * Taken from ALGOLIB under gpl2+
+	 * Taken from ALGLIB under gpl2+
 	 */
 	static float64_t inverse_student_t(int32_t k, float64_t p);
 
@@ -88,7 +88,7 @@ class CStatistics: public CSGObject
 	 * The routine performs interval halving or Newton iterations to find the
 	 * root of incbet(a,b,x) - y = 0.
 	 *
-	 * Taken from ALGOLIB under gpl2+
+	 * Taken from ALGLIB under gpl2+
 	 */
 	static float64_t inverse_incomplete_beta(float64_t a, float64_t b,
 			float64_t y);
@@ -115,7 +115,7 @@ class CStatistics: public CSGObject
 	 * The integral is evaluated by a continued fraction expansion
 	 * or, when b*x is small, by a power series.
 	 *
-	 * Taken from ALGOLIB under gpl2+
+	 * Taken from ALGLIB under gpl2+
 	 */
 	static float64_t incomplete_beta(float64_t a, float64_t b, float64_t x);
 
@@ -133,7 +133,7 @@ class CStatistics: public CSGObject
 	 * and the other for y up to exp(-2).  For larger arguments,
 	 * w = y - 0.5, and  x/sqrt(2pi) = w + w**3 R(w**2)/S(w**2)).
 	 *
-	 * Taken from ALGOLIB under gpl2+
+	 * Taken from ALGLIB under gpl2+
 	 */
 	static float64_t inverse_normal_distribution(float64_t y0);
 
@@ -178,7 +178,7 @@ class CStatistics: public CSGObject
 	 * continued fraction expansion, depending on the relative
 	 * values of a and x.
 	 *
-	 * Taken from ALGOLIB under gpl2+
+	 * Taken from ALGLIB under gpl2+
 	 */
 	static float64_t incomplete_gamma(float64_t a, float64_t x);
 
@@ -203,12 +203,12 @@ class CStatistics: public CSGObject
 	 * continued fraction expansion, depending on the relative
 	 * values of a and x.
 	 *
-	 * Taken from ALGOLIB under gpl2+
+	 * Taken from ALGLIB under gpl2+
 	 */
 	static float64_t incomplete_gamma_completed(float64_t a, float64_t x);
 
 	/** Evaluates the CDF of the gamma distribution with given parameters a, b
-	 * at x. Based on Wikipedia definition and ALGOLIB routines.
+	 * at x. Based on Wikipedia definition and ALGLIB routines.
 	 *
 	 * @param x position to evaluate
 	 * @param a shape parameter
@@ -248,21 +248,21 @@ class CStatistics: public CSGObject
 	/** Power series for incomplete beta integral.
 	 * Use when b*x is small and x not too close to 1.
 	 *
-	 * Taken from ALGOLIB under gpl2+
+	 * Taken from ALGLIB under gpl2+
 	 */
 	static float64_t ibetaf_incompletebetaps(float64_t a, float64_t b,
 			float64_t x, float64_t maxgam);
 
 	/** Continued fraction expansion #1 for incomplete beta integral
 	 *
-	 * Taken from ALGOLIB under gpl2+
+	 * Taken from ALGLIB under gpl2+
 	 */
 	static float64_t ibetaf_incompletebetafe(float64_t a, float64_t b,
 			float64_t x, float64_t big, float64_t biginv);
 
 	/** Continued fraction expansion #2 for incomplete beta integral
 	 *
-	 * Taken from ALGOLIB under gpl2+
+	 * Taken from ALGLIB under gpl2+
 	 */
 	static float64_t ibetaf_incompletebetafe2(float64_t a, float64_t b,
 			float64_t x, float64_t big, float64_t biginv);

diff --git a/src/shogun/statistics/QuadraticTimeMMD.cpp b/src/shogun/statistics/QuadraticTimeMMD.cpp
@@ -47,7 +47,7 @@ float64_t CQuadraticTimeMMD::compute_statistic()
 {
 	/* split computations into three terms from JLMR paper (see documentation )*/
 	index_t m=m_q_start;
-	index_t n=m_p_and_q->get_num_vectors();
+	index_t n=m_p_and_q->get_num_vectors()-m_q_start;
 
 	/* init kernel with features */
 	m_kernel->init(m_p_and_q, m_p_and_q);
@@ -56,40 +56,30 @@ float64_t CQuadraticTimeMMD::compute_statistic()
 	float64_t first=0;
 	for (index_t i=0; i<m; ++i)
 	{
+		/* ensure i!=j while adding up */
 		for (index_t j=0; j<m; ++j)
-		{
-			/* ensure i!=j */
-			if (i==j)
-				continue;
-
-			first+=m_kernel->kernel(i,j);
-		}
+			first+=i==j ? 0 :m_kernel->kernel(i,j);
 	}
 	first/=m*(m-1);
 
 	/* second term */
 	float64_t second=0;
-	for (index_t i=m_q_start; i<n; ++i)
+	for (index_t i=m_q_start; i<m_q_start+n; ++i)
 	{
-		for (index_t j=m_q_start; j<n; ++j)
-		{
-			/* ensure i!=j */
-			if (i==j)
-				continue;
-
-			second+=m_kernel->kernel(i,j);
-		}
+		/* ensure i!=j while adding up */
+		for (index_t j=m_q_start; j<m_q_start+n; ++j)
+			second+=i==j ? 0 :m_kernel->kernel(i,j);
 	}
 	second/=n*(n-1);
 
 	/* third term */
 	float64_t third=0;
 	for (index_t i=0; i<m; ++i)
 	{
-		for (index_t j=m_q_start; j<n; ++j)
+		for (index_t j=m_q_start; j<m_q_start+n; ++j)
 			third+=m_kernel->kernel(i,j);
 	}
-	third*=-2.0/(m*n);
+	third*=2.0/(m*n);
 
 	return first+second-third;
 }
@@ -100,18 +90,22 @@ float64_t CQuadraticTimeMMD::compute_p_value(float64_t statistic)
 
 	switch (m_p_value_method)
 	{
-#ifdef HAVE_LAPACK
 	case MMD2_SPECTRUM:
 	{
+#ifdef HAVE_LAPACK
 		/* get samples from null-distribution and compute p-value of statistic */
 		SGVector<float64_t> null_samples=sample_null_spectrum(
 				m_num_samples_spectrum, m_num_eigenvalues_spectrum);
 		CMath::qsort(null_samples);
 		float64_t pos=CMath::find_position_to_insert(null_samples, statistic);
 		result=1.0-pos/null_samples.vlen;
+#else // HAVE_LAPACK
+		SG_ERROR("CQuadraticTimeMMD::compute_p_value(): Only possible if "
+				"shogun is compiled with LAPACK enabled\n");
+#endif // HAVE_LAPACK
 		break;
 	}
-#endif // HAVE_LAPACK
+
 	case MMD2_GAMMA:
 		result=compute_p_value_gamma(statistic);
 		break;
@@ -202,7 +196,6 @@ float64_t CQuadraticTimeMMD::compute_p_value_gamma(float64_t statistic)
 	 * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
 	 * works since X and Y are concatenated here */
 	m_kernel->init(m_p_and_q, m_p_and_q);
-	SGMatrix<float64_t> K=m_kernel->get_kernel_matrix();
 
 	/* compute mean under H0 of MMD, which is
 	 * meanMMD  = 2/m * ( 1  - 1/m*sum(diag(KL))  );
@@ -212,13 +205,7 @@ float64_t CQuadraticTimeMMD::compute_p_value_gamma(float64_t statistic)
 	{
 		/* virtual KL matrix is in upper right corner of SHOGUN K matrix
 		 * so this sums the diagonal of the matrix between X and Y*/
-		mean_mmd+=K(i, m_q_start+i);
-
-		/* remove diagonal of all pairs of kernel matrices on the fly */
-		K(i, i)=0;
-		K(m_q_start+i, m_q_start+i)=0;
-		K(i, m_q_start+i)=0;
-		K(m_q_start+i, i)=0;
+		mean_mmd+=m_kernel->kernel(i, m_q_start+i);
 	}
 	mean_mmd=2.0/m_q_start*(1.0-1.0/m_q_start*mean_mmd);
 
@@ -230,11 +217,14 @@ float64_t CQuadraticTimeMMD::compute_p_value_gamma(float64_t statistic)
 	{
 		for (index_t j=0; j<m_q_start; ++j)
 		{
-			float64_t to_add=0;
-			to_add+=K(i, j);
-			to_add+=K(m_q_start+i, m_q_start+j);
-			to_add-=K(i, m_q_start+j);
-			to_add-=K(m_q_start+i, j);
+			/* dont add diagonal of all pairs of imaginary kernel matrices */
+			if (i==j || m_q_start+i==j || m_q_start+j==i)
+				continue;
+
+			float64_t to_add=m_kernel->kernel(i, j);
+			to_add+=m_kernel->kernel(m_q_start+i, m_q_start+j);
+			to_add-=m_kernel->kernel(i, m_q_start+j);
+			to_add-=m_kernel->kernel(m_q_start+i, j);
 			var_mmd+=CMath::pow(to_add, 2);
 		}
 	}

diff --git a/src/shogun/statistics/QuadraticTimeMMD.h b/src/shogun/statistics/QuadraticTimeMMD.h
@@ -39,6 +39,8 @@ class CQuadraticTimeMMD : public CKernelTwoSampleTestStatistic
 		 * using the Eigen-spectrum of the centered kernel matrix of the merged
 		 * samples of p and q. May be used to compute p_value (easy)
 		 *
+		 * kernel matrix needs to be stored in memory
+		 *
 		 * Works well if the kernel matrix is NOT diagonal dominant.
 		 * See Gretton, A., Fukumizu, K., & Harchaoui, Z. (2011).
 		 * A fast, consistent kernel two-sample test.
@@ -62,6 +64,8 @@ class CQuadraticTimeMMD : public CKernelTwoSampleTestStatistic
 		 * Returns the p-value for a given statistic value in the
 		 * null-distribution.
 		 *
+		 * Works for arbritarily large kernel matrices (is not precomputed)
+		 *
 		 * See Gretton, A., Fukumizu, K., & Harchaoui, Z. (2011).
 		 * A fast, consistent kernel two-sample test.
 		 *