Skip to content

Commit

Permalink
Merge pull request #669 from karlnapf/master
Browse files Browse the repository at this point in the history
Make examples use data generator class
  • Loading branch information
karlnapf committed Jul 23, 2012
2 parents 28824e9 + 5c16e8c commit 62c71cf
Show file tree
Hide file tree
Showing 8 changed files with 33 additions and 118 deletions.
15 changes: 0 additions & 15 deletions examples/undocumented/libshogun/statistics_hsic.cpp
Expand Up @@ -15,21 +15,6 @@

using namespace shogun;

/** Fills the given matrix with samples from two distributions: the first half
 * of the columns is standard normal (P); the second half is also standard
 * normal but with a mean shift added in the first dimension only (Q).
 *
 * @param target matrix to fill; columns are samples, rows are dimensions
 * @param difference mean shift applied to the first dimension of Q's samples
 */
void create_mean_data(SGMatrix<float64_t> target, float64_t difference)
{
	index_t half=target.num_cols/2;

	for (index_t row=0; row<target.num_rows; ++row)
	{
		/* first half of columns: samples from P, plain standard normal */
		for (index_t col=0; col<half; ++col)
			target(row,col)=CMath::randn_double();

		/* second half of columns: samples from Q; only the first dimension
		 * receives the mean shift */
		for (index_t col=half; col<target.num_cols; ++col)
		{
			float64_t shift=(row==0) ? difference : 0;
			target(row,col)=CMath::randn_double()+shift;
		}
	}
}

void create_fixed_data_kernel_small(CFeatures*& features_p,
CFeatures*& features_q, CKernel*& kernel_p, CKernel*& kernel_q)
{
Expand Down
30 changes: 9 additions & 21 deletions examples/undocumented/libshogun/statistics_linear_time_mmd.cpp
Expand Up @@ -11,26 +11,11 @@
#include <shogun/statistics/LinearTimeMMD.h>
#include <shogun/kernel/GaussianKernel.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/features/DataGenerator.h>
#include <shogun/mathematics/Statistics.h>

using namespace shogun;


/** Generates example data for a two-sample problem directly into the given
 * matrix. Columns [0, num_cols/2) hold standard-normal samples of P; columns
 * [num_cols/2, num_cols) hold samples of Q, which is standard normal with
 * @p difference added to its first dimension.
 *
 * @param target preallocated matrix to fill (rows = dimensions, cols = samples)
 * @param difference mean difference between P and Q in the first dimension
 */
void create_mean_data(SGMatrix<float64_t> target, float64_t difference)
{
	index_t num_dims=target.num_rows;
	index_t num_samples=target.num_cols;

	for (index_t d=0; d<num_dims; ++d)
	{
		index_t s=0;

		/* P block: unshifted standard normal draws */
		while (s<num_samples/2)
			target(d,s++)=CMath::randn_double();

		/* Q block: identical except the first dimension is mean-shifted */
		while (s<num_samples)
		{
			target(d,s)=CMath::randn_double();
			if (d==0)
				target(d,s)+=difference;
			++s;
		}
	}
}

/** tests the linear mmd statistic for a single data case and ensures
* equality with matlab implementation */
void test_linear_mmd_fixed()
Expand Down Expand Up @@ -81,7 +66,8 @@ void test_linear_mmd_random()

for (index_t i=0; i<num_runs; ++i)
{
create_mean_data(data, difference);
CDataGenerator::generate_mean_data(m, dimension, difference,
data.matrix);
mmds[i]=mmd->compute_statistic();
}

Expand Down Expand Up @@ -122,7 +108,8 @@ void test_linear_mmd_variance_estimate()

for (index_t i=0; i<num_runs; ++i)
{
create_mean_data(data, difference);
CDataGenerator::generate_mean_data(m, dimension, difference,
data.matrix);
vars[i]=mmd->compute_variance_estimate();
}

Expand Down Expand Up @@ -150,9 +137,9 @@ void test_linear_mmd_variance_estimate_vs_bootstrap()
float64_t difference=0.5;
float64_t sigma=2;

SGMatrix<float64_t> data(dimension, 2*m);
SGMatrix<float64_t> data=CDataGenerator::generate_mean_data(m, dimension,
difference);;
CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
create_mean_data(data, difference);

/* shoguns kernel width is different */
CGaussianKernel* kernel=new CGaussianKernel(100, sigma*sigma*2);
Expand Down Expand Up @@ -201,7 +188,8 @@ void test_linear_mmd_type2_error()

for (index_t i=0; i<num_runs; ++i)
{
create_mean_data(data, difference);
CDataGenerator::generate_mean_data(m, dimension, difference,
data.matrix);

/* technically, this leads to a wrong result since training (statistic)
* and testing (p-value) have to happen on different data, but this
Expand Down
Expand Up @@ -17,21 +17,6 @@

using namespace shogun;

/** Writes samples from two Gaussian distributions into @p target. The left
 * half of the columns comes from P (standard normal); the right half comes
 * from Q, equal to P except that its first dimension has mean
 * @p difference instead of zero.
 *
 * @param target matrix that receives the samples (one sample per column)
 * @param difference mean shift of Q's first dimension relative to P
 */
void create_mean_data(SGMatrix<float64_t> target, float64_t difference)
{
	const index_t cols=target.num_cols;
	const index_t split=cols/2;

	for (index_t i=0; i<target.num_rows; ++i)
	{
		for (index_t j=0; j<cols; ++j)
		{
			/* every entry starts as a standard normal draw */
			float64_t value=CMath::randn_double();

			/* entries of Q (right half) get the shift, first dimension only */
			if (j>=split && i==0)
				value+=difference;

			target(i,j)=value;
		}
	}
}

SGMatrix<float64_t> create_fixed_data(index_t m, index_t dim)
{
SGMatrix<float64_t> data(dim,2*m);
Expand Down
31 changes: 8 additions & 23 deletions examples/undocumented/python_modular/statistics_linear_time_mmd.py
Expand Up @@ -7,12 +7,10 @@
# Written (C) 2012 Heiko Strathmann
#
from numpy import *
from tools.two_distributions_data import TwoDistributionsData

gen_data=TwoDistributionsData()

def statistics_linear_time_mmd():
from shogun.Features import RealFeatures
from shogun.Features import DataGenerator
from shogun.Kernel import GaussianKernel
from shogun.Statistics import LinearTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
Expand All @@ -22,42 +20,29 @@ def statistics_linear_time_mmd():
dim=2
difference=0.5

# data is standard normal distributed. only one dimension of Y has a mean
# shift of difference
# use data generator class to produce example data
# in practice, this data generation function could be replaced by a method
# that obtains data from a stream
(X,Y)=gen_data.create_mean_data(n,dim,difference)

print "dimension means of X", [mean(x) for x in X]
print "dimension means of Y", [mean(x) for x in Y]
data=DataGenerator.generate_mean_data(n,dim,difference)
print "dimension means of X", mean(data.T[0:n].T)
print "dimension means of Y", mean(data.T[n:2*n+1].T)

# create shogun feature representation
features_x=RealFeatures(X)
features_y=RealFeatures(Y)
features=RealFeatures(data)

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)

mmd=LinearTimeMMD(kernel,features_x, features_y)
mmd=LinearTimeMMD(kernel,features, n)

# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05
# for the linear time mmd, the statistic has to be computed on different
# data than the p-value, so first, compute statistic, and then compute
# p-value on other data
# this annoying property is since the null-distribution should stay normal
# which is not the case if "training/test" data would be the same
statistic=mmd.compute_statistic()
print "test statistic:", statistic

# generate new data (same distributions as old) and new statistic object
(X,Y)=gen_data.create_mean_data(n,dim,difference)
features_x=RealFeatures(X)
features_y=RealFeatures(Y)
mmd=LinearTimeMMD(kernel,features_x, features_y)

# do the same thing using two different ways to approximate the null-distribution:
# bootstrapping and gaussian approximation (only for really large samples)
alpha=0.05
Expand Down
Expand Up @@ -8,14 +8,12 @@
#
from numpy import *
#from matplotlib import pyplot
from tools.two_distributions_data import TwoDistributionsData

gen_data=TwoDistributionsData()

# performs learning of optimal non-negative kernel weights for a linear time
# two sample test using the linear time Maximum Mean Discrepancy
def statistics_linear_time_mmd_kernel_choice():
from shogun.Features import RealFeatures, CombinedFeatures
from shogun.Features import DataGenerator
from shogun.Kernel import GaussianKernel, CombinedKernel
from shogun.Statistics import LinearTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
Expand All @@ -25,15 +23,13 @@ def statistics_linear_time_mmd_kernel_choice():
dim=5
difference=2

# data is standard normal distributed. only one dimension of Y has a mean
# shift of difference
(X,Y)=gen_data.create_mean_data(n,dim,difference)
# use data generator class to produce example data
# in practice, this data generation function could be replaced by a method
# that obtains data from a stream
data=DataGenerator.generate_mean_data(n,dim,difference)

# concatenate since MMD class takes data as one feature object
# (it is possible to give two, but then data is copied)
Z=concatenate((X,Y), axis=1)
print "dimension means of X", [mean(x) for x in X]
print "dimension means of Y", [mean(x) for x in Y]
print "dimension means of X", mean(data.T[0:n].T)
print "dimension means of Y", mean(data.T[n:2*n+1].T)

# create kernels/features to choose from
# here: just a bunch of Gaussian Kernels with different widths
Expand All @@ -52,7 +48,7 @@ def statistics_linear_time_mmd_kernel_choice():
# all kernels work on same features
for i in range(len(sigmas)):
kernel.append_kernel(GaussianKernel(10, shogun_sigmas[i]))
features.append_feature_obj(RealFeatures(Z))
features.append_feature_obj(RealFeatures(data))

mmd=LinearTimeMMD(kernel,features, n)

Expand Down
Expand Up @@ -7,12 +7,10 @@
# Written (C) 2012 Heiko Strathmann
#
from numpy import *
from tools.two_distributions_data import TwoDistributionsData

gen_data=TwoDistributionsData()

def statistics_linear_time_mmd():
from shogun.Features import RealFeatures
from shogun.Features import DataGenerator
from shogun.Kernel import GaussianKernel
from shogun.Statistics import QuadraticTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
Expand All @@ -23,23 +21,21 @@ def statistics_linear_time_mmd():
dim=2
difference=0.5

# data is standard normal distributed. only one dimension of Y has a mean
# shift of difference
(X,Y)=gen_data.create_mean_data(n,dim,difference)
# use data generator class to produce example data
data=DataGenerator.generate_mean_data(n,dim,difference)

print "dimension means of X", [mean(x) for x in X]
print "dimension means of Y", [mean(x) for x in Y]
print "dimension means of X", mean(data.T[0:n].T)
print "dimension means of Y", mean(data.T[n:2*n+1].T)

# create shogun feature representation
features_x=RealFeatures(X)
features_y=RealFeatures(Y)
features=RealFeatures(data)

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)

mmd=QuadraticTimeMMD(kernel,features_x, features_y)
mmd=QuadraticTimeMMD(kernel,features, n)

# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05 using different methods to approximate
Expand Down

This file was deleted.

2 changes: 1 addition & 1 deletion src/shogun/features/DataGenerator.h
Expand Up @@ -32,7 +32,7 @@ class CDataGenerator: public CSGObject
*
* @param m number of samples to generate
* @param dim dimension of generated samples
* @param mean_shift is added mean of first dimension
* @param mean_shift is added to mean of first dimension
* @target_data if non-NULL then this is used as matrix data storage. Make
* sure that its dimensions fit
* @return matrix with concatenated samples,first p then q
Expand Down

0 comments on commit 62c71cf

Please sign in to comment.