Skip to content

Commit

Permalink
Merge pull request #669 from karlnapf/master
Browse files Browse the repository at this point in the history
Make examples use data generator class
  • Loading branch information
karlnapf committed Jul 23, 2012
2 parents 28824e9 + 5c16e8c commit 62c71cf
Show file tree
Hide file tree
Showing 8 changed files with 33 additions and 118 deletions.
15 changes: 0 additions & 15 deletions examples/undocumented/libshogun/statistics_hsic.cpp
Expand Up @@ -15,21 +15,6 @@

using namespace shogun;

/** Fills the given matrix with samples from two distributions: the first half
 * of the columns is standard normal (P); the second half is also standard
 * normal but with a mean shift added in the first dimension only (Q).
 *
 * @param target matrix to fill; columns are samples, rows are dimensions
 * @param difference mean shift applied to the first dimension of Q's samples
 */
void create_mean_data(SGMatrix<float64_t> target, float64_t difference)
{
	index_t half=target.num_cols/2;

	for (index_t row=0; row<target.num_rows; ++row)
	{
		/* first half of columns: samples from P, plain standard normal */
		for (index_t col=0; col<half; ++col)
			target(row,col)=CMath::randn_double();

		/* second half of columns: samples from Q; only the first dimension
		 * receives the mean shift */
		for (index_t col=half; col<target.num_cols; ++col)
		{
			float64_t shift=(row==0) ? difference : 0;
			target(row,col)=CMath::randn_double()+shift;
		}
	}
}

void create_fixed_data_kernel_small(CFeatures*& features_p,
CFeatures*& features_q, CKernel*& kernel_p, CKernel*& kernel_q)
{
Expand Down
30 changes: 9 additions & 21 deletions examples/undocumented/libshogun/statistics_linear_time_mmd.cpp
Expand Up @@ -11,26 +11,11 @@
#include <shogun/statistics/LinearTimeMMD.h>
#include <shogun/kernel/GaussianKernel.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/features/DataGenerator.h>
#include <shogun/mathematics/Statistics.h>

using namespace shogun;


/** Generates example data for a two-sample problem directly into the given
 * matrix. Columns [0, num_cols/2) hold standard-normal samples of P; columns
 * [num_cols/2, num_cols) hold samples of Q, which is standard normal with
 * @p difference added to its first dimension.
 *
 * @param target preallocated matrix to fill (rows = dimensions, cols = samples)
 * @param difference mean difference between P and Q in the first dimension
 */
void create_mean_data(SGMatrix<float64_t> target, float64_t difference)
{
	index_t num_dims=target.num_rows;
	index_t num_samples=target.num_cols;

	for (index_t d=0; d<num_dims; ++d)
	{
		index_t s=0;

		/* P block: unshifted standard normal draws */
		while (s<num_samples/2)
			target(d,s++)=CMath::randn_double();

		/* Q block: identical except the first dimension is mean-shifted */
		while (s<num_samples)
		{
			target(d,s)=CMath::randn_double();
			if (d==0)
				target(d,s)+=difference;
			++s;
		}
	}
}

/** tests the linear mmd statistic for a single data case and ensures
* equality with matlab implementation */
void test_linear_mmd_fixed()
Expand Down Expand Up @@ -81,7 +66,8 @@ void test_linear_mmd_random()

for (index_t i=0; i<num_runs; ++i)
{
create_mean_data(data, difference);
CDataGenerator::generate_mean_data(m, dimension, difference,
data.matrix);
mmds[i]=mmd->compute_statistic();
}

Expand Down Expand Up @@ -122,7 +108,8 @@ void test_linear_mmd_variance_estimate()

for (index_t i=0; i<num_runs; ++i)
{
create_mean_data(data, difference);
CDataGenerator::generate_mean_data(m, dimension, difference,
data.matrix);
vars[i]=mmd->compute_variance_estimate();
}

Expand Down Expand Up @@ -150,9 +137,9 @@ void test_linear_mmd_variance_estimate_vs_bootstrap()
float64_t difference=0.5;
float64_t sigma=2;

SGMatrix<float64_t> data(dimension, 2*m);
SGMatrix<float64_t> data=CDataGenerator::generate_mean_data(m, dimension,
difference);;
CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
create_mean_data(data, difference);

/* shoguns kernel width is different */
CGaussianKernel* kernel=new CGaussianKernel(100, sigma*sigma*2);
Expand Down Expand Up @@ -201,7 +188,8 @@ void test_linear_mmd_type2_error()

for (index_t i=0; i<num_runs; ++i)
{
create_mean_data(data, difference);
CDataGenerator::generate_mean_data(m, dimension, difference,
data.matrix);

/* technically, this leads to a wrong result since training (statistic)
* and testing (p-value) have to happen on different data, but this
Expand Down
Expand Up @@ -17,21 +17,6 @@

using namespace shogun;

/** Writes samples from two Gaussian distributions into @p target. The left
 * half of the columns comes from P (standard normal); the right half comes
 * from Q, equal to P except that its first dimension has mean
 * @p difference instead of zero.
 *
 * @param target matrix that receives the samples (one sample per column)
 * @param difference mean shift of Q's first dimension relative to P
 */
void create_mean_data(SGMatrix<float64_t> target, float64_t difference)
{
	const index_t cols=target.num_cols;
	const index_t split=cols/2;

	for (index_t i=0; i<target.num_rows; ++i)
	{
		for (index_t j=0; j<cols; ++j)
		{
			/* every entry starts as a standard normal draw */
			float64_t value=CMath::randn_double();

			/* entries of Q (right half) get the shift, first dimension only */
			if (j>=split && i==0)
				value+=difference;

			target(i,j)=value;
		}
	}
}

SGMatrix<float64_t> create_fixed_data(index_t m, index_t dim)
{
SGMatrix<float64_t> data(dim,2*m);
Expand Down
31 changes: 8 additions & 23 deletions examples/undocumented/python_modular/statistics_linear_time_mmd.py
Expand Up @@ -7,12 +7,10 @@
# Written (C) 2012 Heiko Strathmann
#
from numpy import *
from tools.two_distributions_data import TwoDistributionsData

gen_data=TwoDistributionsData()

def statistics_linear_time_mmd():
from shogun.Features import RealFeatures
from shogun.Features import DataGenerator
from shogun.Kernel import GaussianKernel
from shogun.Statistics import LinearTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
Expand All @@ -22,42 +20,29 @@ def statistics_linear_time_mmd():
dim=2
difference=0.5

# data is standard normal distributed. only one dimension of Y has a mean
# shift of difference
# use data generator class to produce example data
# in practice, this data generation function could be replaced by a method
# that obtains data from a stream
(X,Y)=gen_data.create_mean_data(n,dim,difference)

print "dimension means of X", [mean(x) for x in X]
print "dimension means of Y", [mean(x) for x in Y]
data=DataGenerator.generate_mean_data(n,dim,difference)
print "dimension means of X", mean(data.T[0:n].T)
print "dimension means of Y", mean(data.T[n:2*n+1].T)

# create shogun feature representation
features_x=RealFeatures(X)
features_y=RealFeatures(Y)
features=RealFeatures(data)

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)

mmd=LinearTimeMMD(kernel,features_x, features_y)
mmd=LinearTimeMMD(kernel,features, n)

# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05
# for the linear time mmd, the statistic has to be computed on different
# data than the p-value, so first, compute statistic, and then compute
# p-value on other data
# this annoying property is since the null-distribution should stay normal
# which is not the case if "training/test" data would be the same
statistic=mmd.compute_statistic()
print "test statistic:", statistic

# generate new data (same distributions as old) and new statistic object
(X,Y)=gen_data.create_mean_data(n,dim,difference)
features_x=RealFeatures(X)
features_y=RealFeatures(Y)
mmd=LinearTimeMMD(kernel,features_x, features_y)

# do the same thing using two different ways to approximate the null-distribution:
# bootstrapping and gaussian approximation (only for really large samples)
alpha=0.05
Expand Down
Expand Up @@ -8,14 +8,12 @@
#
from numpy import *
#from matplotlib import pyplot
from tools.two_distributions_data import TwoDistributionsData

gen_data=TwoDistributionsData()

# performs learning of optimal non-negative kernel weights for a linear time
# two sample test using the linear time Maximum Mean Discrepancy
def statistics_linear_time_mmd_kernel_choice():
from shogun.Features import RealFeatures, CombinedFeatures
from shogun.Features import DataGenerator
from shogun.Kernel import GaussianKernel, CombinedKernel
from shogun.Statistics import LinearTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
Expand All @@ -25,15 +23,13 @@ def statistics_linear_time_mmd_kernel_choice():
dim=5
difference=2

# data is standard normal distributed. only one dimension of Y has a mean
# shift of difference
(X,Y)=gen_data.create_mean_data(n,dim,difference)
# use data generator class to produce example data
# in practice, this data generation function could be replaced by a method
# that obtains data from a stream
data=DataGenerator.generate_mean_data(n,dim,difference)

# concatenate since MMD class takes data as one feature object
# (it is possible to give two, but then data is copied)
Z=concatenate((X,Y), axis=1)
print "dimension means of X", [mean(x) for x in X]
print "dimension means of Y", [mean(x) for x in Y]
print "dimension means of X", mean(data.T[0:n].T)
print "dimension means of Y", mean(data.T[n:2*n+1].T)

# create kernels/features to choose from
# here: just a bunch of Gaussian Kernels with different widths
Expand All @@ -52,7 +48,7 @@ def statistics_linear_time_mmd_kernel_choice():
# all kernels work on same features
for i in range(len(sigmas)):
kernel.append_kernel(GaussianKernel(10, shogun_sigmas[i]))
features.append_feature_obj(RealFeatures(Z))
features.append_feature_obj(RealFeatures(data))

mmd=LinearTimeMMD(kernel,features, n)

Expand Down
Expand Up @@ -7,12 +7,10 @@
# Written (C) 2012 Heiko Strathmann
#
from numpy import *
from tools.two_distributions_data import TwoDistributionsData

gen_data=TwoDistributionsData()

def statistics_linear_time_mmd():
from shogun.Features import RealFeatures
from shogun.Features import DataGenerator
from shogun.Kernel import GaussianKernel
from shogun.Statistics import QuadraticTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
Expand All @@ -23,23 +21,21 @@ def statistics_linear_time_mmd():
dim=2
difference=0.5

# data is standard normal distributed. only one dimension of Y has a mean
# shift of difference
(X,Y)=gen_data.create_mean_data(n,dim,difference)
# use data generator class to produce example data
data=DataGenerator.generate_mean_data(n,dim,difference)

print "dimension means of X", [mean(x) for x in X]
print "dimension means of Y", [mean(x) for x in Y]
print "dimension means of X", mean(data.T[0:n].T)
print "dimension means of Y", mean(data.T[n:2*n+1].T)

# create shogun feature representation
features_x=RealFeatures(X)
features_y=RealFeatures(Y)
features=RealFeatures(data)

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)

mmd=QuadraticTimeMMD(kernel,features_x, features_y)
mmd=QuadraticTimeMMD(kernel,features, n)

# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05 using different methods to approximate
Expand Down

This file was deleted.

2 changes: 1 addition & 1 deletion src/shogun/features/DataGenerator.h
Expand Up @@ -32,7 +32,7 @@ class CDataGenerator: public CSGObject
*
* @param m number of samples to generate
* @param dim dimension of generated samples
* @param mean_shift is added mean of first dimension
* @param mean_shift is added to mean of first dimension
* @target_data if non-NULL then this is used as matrix data storage. Make
* sure that its dimensions fit
* @return matrix with concatenated samples,first p then q
Expand Down

0 comments on commit 62c71cf

Please sign in to comment.