Merge pull request #613 from karlnapf/master
a new python example and some fixes
karlnapf committed Jun 29, 2012
2 parents db38a5d + 1de3fce commit e9b94ce
Showing 5 changed files with 129 additions and 5 deletions.
@@ -191,6 +191,12 @@ void test_linear_mmd_type2_error()
for (index_t i=0; i<num_runs; ++i)
{
create_mean_data(data, difference);

/* technically, this leads to a wrong result since training (statistic)
 * and testing (p-value) have to happen on different data, but this
 * is only to compare against MATLAB, where I made the same "mistake".
 * See the python_modular example for how to do this correctly.
 * Note that this only matters when using the Gaussian approximation */
float64_t statistic=mmd->compute_statistic();

float64_t p_value_est=mmd->compute_p_value(statistic);
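As the comment above notes, with the Gaussian approximation the statistic ("training") and the p-value ("testing") should be computed on different samples. A minimal sketch of that idea, assuming the python_modular API used by the examples in this commit (LinearTimeMMD, MMD1_GAUSSIAN, RealFeatures, GaussianKernel); the inline data generation is illustrative only:

from numpy import random
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Statistics import LinearTimeMMD, MMD1_GAUSSIAN

def gaussian_p_value_on_fresh_data(n, dim, difference):
	# "training" data: used only to compute the statistic
	X=random.randn(dim,n)
	Y=random.randn(dim,n)
	Y[0,:]+=difference
	kernel=GaussianKernel(10,8)
	mmd=LinearTimeMMD(kernel, RealFeatures(X), RealFeatures(Y))
	statistic=mmd.compute_statistic()

	# "test" data: an independent sample from the same distributions,
	# used only to compute the p-value of the statistic above
	X2=random.randn(dim,n)
	Y2=random.randn(dim,n)
	Y2[0,:]+=difference
	mmd2=LinearTimeMMD(kernel, RealFeatures(X2), RealFeatures(Y2))
	mmd2.set_null_approximation_method(MMD1_GAUSSIAN)
	return mmd2.compute_p_value(statistic)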
16 changes: 14 additions & 2 deletions examples/undocumented/python_modular/statistics_linear_time_mmd.py
@@ -15,7 +15,7 @@ def statistics_linear_time_mmd():
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Statistics import LinearTimeMMD
from shogun.Statistics import BOOTSTRAP
from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN

import matplotlib.pyplot as plt

@@ -60,8 +60,20 @@ def statistics_linear_time_mmd():
features_y=RealFeatures(Y)
mmd=LinearTimeMMD(kernel,features_x, features_y)

p_value=mmd.compute_p_value(statistic)
# do the same thing using two different ways to approximate the null-distribution:
# bootstrapping and the gaussian approximation (only for really large samples)
alpha=0.05

print "computing p-value using bootstrapping"
mmd.set_null_approximation_method(BOOTSTRAP)
mmd.set_bootstrap_iterations(500)
p_value=mmd.compute_p_value(statistic)
print "p_value:", p_value
print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

print "computing p-value using gaussian approximation"
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
p_value=mmd.compute_p_value(statistic)
print "p_value:", p_value
print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

105 changes: 105 additions & 0 deletions examples/undocumented/python_modular/statistics_quadratic_time_mmd.py
@@ -0,0 +1,105 @@
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012 Heiko Strathmann
#
from numpy import *
from tools.two_distributions_data import TwoDistributionsData

gen_data=TwoDistributionsData()

def statistics_quadratic_time_mmd():
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Statistics import QuadraticTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED

import matplotlib.pyplot as plt

# note that the quadratic time mmd sometimes has to store full kernel matrices,
# which massively upper-bounds the feasible sample size
n=500
dim=2
difference=0.5

# data is standard normally distributed; only one dimension of Y has a mean
# shift of difference
(X,Y)=gen_data.create_mean_data(n,dim,difference)

print "dimension means of X", [mean(x) for x in X]
print "dimension means of Y", [mean(x) for x in Y]

# create shogun feature representation
features_x=RealFeatures(X)
features_y=RealFeatures(Y)

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization,
# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)
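# (illustrative check of the conversion above, not part of the original
# example: for sigma=2, tau=2*sigma**2=8, which is the width passed to
# GaussianKernel)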

mmd=QuadraticTimeMMD(kernel,features_x, features_y)

# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05 using different methods to approximate
# null-distribution
statistic=mmd.compute_statistic()
alpha=0.05

print "computing p-value using bootstrapping"
mmd.set_null_approximation_method(BOOTSTRAP)
# normally, at least 250 iterations should be done, but that takes a long time
mmd.set_bootstrap_iterations(10)
# bootstrapping allows usage of unbiased or biased statistic
mmd.set_statistic_type(UNBIASED)
p_value=mmd.compute_p_value(statistic)
print "p_value:", p_value
print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

print "computing p-value using spectrum method"
mmd.set_null_approximation_method(MMD2_SPECTRUM)
# normally, at least 250 null samples should be drawn, but that takes a long time
mmd.set_num_samples_sepctrum(50)
mmd.set_num_eigenvalues_spectrum(n-10)
# spectrum method computes p-value for biased statistics only
mmd.set_statistic_type(BIASED)
p_value=mmd.compute_p_value(statistic)
print "p_value:", p_value
print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

print "computing p-value using gamma method"
mmd.set_null_approximation_method(MMD2_GAMMA)
# gamma method computes p-value for biased statistics only
mmd.set_statistic_type(BIASED)
p_value=mmd.compute_p_value(statistic)
print "p_value:", p_value
print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

# sample from the null distribution (these may be plotted, for example)
# the mean should be close to zero, the variance strongly depends on data/kernel
# bootstrapping, biased statistic
print "sampling null distribution using bootstrapping"
mmd.set_null_approximation_method(BOOTSTRAP)
mmd.set_statistic_type(BIASED)
mmd.set_bootstrap_iterations(10)
null_samples=mmd.bootstrap_null()
print "null mean:", mean(null_samples)
print "null variance:", var(null_samples)

# sample from the null distribution (these may be plotted, for example)
# the mean should be close to zero, the variance strongly depends on data/kernel
# spectrum, biased statistic
print "sampling null distribution using spectrum method"
mmd.set_null_approximation_method(MMD2_SPECTRUM)
mmd.set_statistic_type(BIASED)
# 200 samples using 100 eigenvalues
null_samples=mmd.sample_null_spectrum(200,100)
print "null mean:", mean(null_samples)
print "null variance:", var(null_samples)

if __name__=='__main__':
print('QuadraticTimeMMD')
statistics_quadratic_time_mmd()
3 changes: 2 additions & 1 deletion src/shogun/statistics/LinearTimeMMD.h
@@ -76,7 +76,8 @@ class CLinearTimeMMD: public CKernelTwoSampleTestStatistic
/** Computes the p-value for a given statistic. The method for computing
* the p-value can be set via set_p_value_method(). Since the null-
* distribution is normal, a Gaussian approximation is available along with
* bootstrapping
* bootstrapping. For Gaussian approximation, training and test data have
* to be DIFFERENT samples from the same distribution
*
* @param statistic statistic to compute the p-value for
*
4 changes: 2 additions & 2 deletions src/shogun/statistics/QuadraticTimeMMD.h
@@ -159,15 +159,15 @@ class CQuadraticTimeMMD : public CKernelTwoSampleTestStatistic
float64_t compute_p_value_gamma(float64_t statistic);

/** setter for number of samples to use in spectrum based p-value
* computation
* computation.
*
* @param num_samples_spectrum number of samples to draw from
* approximate null-distribution
*/
void set_num_samples_sepctrum(index_t num_samples_spectrum);

/** setter for number of eigenvalues to use in spectrum based p-value
* computation
* computation. Maximum is 2*m_q_start-1
*
* @param num_eigenvalues_spectrum number of eigenvalues to use to
* approximate null-distribution
