Commit

Merge pull request #706 from karlnapf/master
more minor fixes and graphical examples
karlnapf committed Aug 11, 2012
2 parents ae5d490 + f059754 commit b88994d
Showing 5 changed files with 263 additions and 16 deletions.
124 changes: 124 additions & 0 deletions examples/undocumented/python_modular/graphical/statistics_hsic.py
@@ -0,0 +1,124 @@
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012 Heiko Strathmann
#
from numpy import *
from pylab import *
from scipy import *

from shogun.Features import RealFeatures
from shogun.Features import DataGenerator
from shogun.Kernel import GaussianKernel
from shogun.Statistics import HSIC
from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA

# parameters, change to get different results
m=250
difference=3

# setting the angle lower makes a harder test
angle=pi/30

# number of samples taken from null and alternative distribution
num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)
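# (data[0] and data[1] are the two one-dimensional variables x and y whose
# independence is tested below)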

# create shogun feature representation
features_x=RealFeatures(array([data[0]]))
features_y=RealFeatures(array([data[1]]))

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
# Note that kernels per data can be different
kernel_x=GaussianKernel(10,8)
kernel_y=GaussianKernel(10,8)
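# Illustrative sketch of the width conversion above (sigma and tau below are
# not part of the original example): for a desired Gaussian width sigma,
# SHOGUN's width parameter is tau=2*sigma^2. The first GaussianKernel argument
# (10) is the kernel cache size.
sigma=2.0
tau=2*sigma**2  # =8.0, the value passed to GaussianKernel above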

# create hsic instance. Note that this is a convenience constructor which copies
# feature data. features_x and features_y are not those used inside hsic.
# This is only for user-friendliness. Usually, it is ok to do this.
# Below, the alternative distribution is sampled, which means
# that new feature objects have to be created in each iteration (slow).
# However, normally, the alternative distribution is not sampled.
hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

# sample alternative distribution
alt_samples=zeros(num_null_samples)
for i in range(len(alt_samples)):
	data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)
	features_x.set_feature_matrix(array([data[0]]))
	features_y.set_feature_matrix(array([data[1]]))

	# re-create hsic instance every time since feature objects are copied due to
	# usage of convenience constructor
	hsic=HSIC(kernel_x,kernel_y,features_x,features_y)
	alt_samples[i]=hsic.compute_statistic()

# sample from null distribution
# bootstrapping, biased statistic
hsic.set_null_approximation_method(BOOTSTRAP)
hsic.set_bootstrap_iterations(num_null_samples)
null_samples_boot=hsic.bootstrap_null()
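# Minimal sketch, not part of the original example: an empirical p-value can be
# read off the bootstrapped null samples. Note that hsic still holds the last
# data sampled from the alternative in the loop above.
statistic=hsic.compute_statistic()
p_value_boot=mean(null_samples_boot>=statistic)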

# fit gamma distribution, biased statistic
hsic.set_null_approximation_method(HSIC_GAMMA)
gamma_params=hsic.fit_null_gamma()
# sample gamma with parameters
null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)])
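# Assumption, not part of the original example: if gamma_params holds (shape,
# scale) as in the sampling line above, the gamma approximation could also be
# evaluated directly via its CDF instead of being sampled, e.g.
# from scipy.stats import gamma as gamma_dist
# p_value_gamma=1-gamma_dist.cdf(statistic, gamma_params[0], scale=gamma_params[1])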

# plot
figure()

# plot data x and y
subplot(2,2,1)
plot(data[0], data[1], 'o')
title('Data, rotation=$\pi$/'+str(1/angle*pi)+'\nm='+str(m))
xlabel('$x$')
ylabel('$y$')
grid(True)

# compute threshold for test level
alpha=0.05
null_samples_boot.sort()
null_samples_gamma.sort()
thresh_boot=null_samples_boot[int(floor(len(null_samples_boot)*(1-alpha)))]
thresh_gamma=null_samples_gamma[int(floor(len(null_samples_gamma)*(1-alpha)))]
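# Equivalent sketch, not in the original: the thresholds are the empirical
# (1-alpha)-quantiles of the null samples, e.g.
# thresh_boot=percentile(null_samples_boot, 100*(1-alpha))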

type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples)
type_one_error_gamma=sum(null_samples_gamma<thresh_boot)/float(num_null_samples)

# plot alternative distribution with threshold
subplot(2,2,2)
hist(alt_samples, 20, normed=True);
axvline(thresh_boot, 0, 1, linewidth=2, color='red')
type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))
grid(True)

# compute range for all null distribution histograms
hist_range=[min([min(null_samples_boot), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_gamma)])]

# plot null distribution with threshold
subplot(2,2,3)
hist(null_samples_boot, 20, range=hist_range, normed=True);
axvline(thresh_boot, 0, 1, linewidth=2, color='red')
title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))
grid(True)

# plot null distribution gamma
subplot(2,2,4)
hist(null_samples_gamma, 20, range=hist_range, normed=True);
axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))
grid(True)

# pull plots a bit apart
subplots_adjust(hspace=0.5)
subplots_adjust(wspace=0.5)
show()
@@ -0,0 +1,122 @@
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012 Heiko Strathmann
#
from numpy import *
from pylab import *
from scipy import *

from shogun.Features import RealFeatures
from shogun.Features import DataGenerator
from shogun.Kernel import GaussianKernel
from shogun.Statistics import LinearTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN

# parameters, change to get different results
m=10000
dim=2

# setting the difference of the first dimension smaller makes a harder test
difference=0.4

# number of samples taken from null and alternative distribution
num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_mean_data(m,dim,difference)
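# (judging from the plotting code below, the generated matrix appears to hold
# 2*m columns: the first m columns are samples from p, the second m columns are
# samples from q, whose first dimension is shifted by `difference`)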

# create shogun feature representation
features=RealFeatures(data)

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)

# create linear time MMD instance (the statistic is computed in linear time)
mmd=LinearTimeMMD(kernel,features, m)

# sample alternative distribution
alt_samples=zeros(num_null_samples)
for i in range(len(alt_samples)):
	data=DataGenerator.generate_mean_data(m,dim,difference)
	features.set_feature_matrix(data)
	alt_samples[i]=mmd.compute_statistic()

# sample from null distribution
# bootstrapping, biased statistic
mmd.set_null_approximation_method(BOOTSTRAP)
mmd.set_bootstrap_iterations(num_null_samples)
null_samples_boot=mmd.bootstrap_null()

# fit normal distribution to null and sample a normal distribution
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
variance=mmd.compute_variance_estimate()
null_samples_gaussian=normal(0,sqrt(variance),num_null_samples)
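# Assumption, not part of the original example: since the null is approximated
# as N(0, variance) here, a p-value could also be computed directly, e.g.
# from scipy.stats import norm
# p_value_gaussian=1-norm.cdf(mmd.compute_statistic(), scale=sqrt(variance))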

# plot
figure()

# plot data of p and q
subplot(2,3,1)
plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
plot(data[0][m+1:2*m], data[1][m+1:2*m], 'bo', label='$y$', alpha=0.2)
legend()
title('Data, shift in $x_1$='+str(difference)+'\nm='+str(m))
xlabel('$x_1, y_1$')
ylabel('$x_2, y_2$')
grid(True)

# histogram of first data dimension and pdf
subplot(2,3,2)
hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
xs=linspace(min(data[0])-1,max(data[0])+1, 50)
plot(xs,normpdf( xs, 0, 1), 'r', linewidth=3)
plot(xs,normpdf( xs, difference, 1), 'b', linewidth=3)
title('Data PDF in $x_1$')
grid(True)

# compute threshold for test level
alpha=0.05
null_samples_boot.sort()
null_samples_gaussian.sort()
thresh_boot=null_samples_boot[int(floor(len(null_samples_boot)*(1-alpha)))]
thresh_gaussian=null_samples_gaussian[int(floor(len(null_samples_gaussian)*(1-alpha)))]

type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples)
type_one_error_gaussian=sum(null_samples_gaussian<thresh_boot)/float(num_null_samples)

# plot alternative distribution with threshold
subplot(2,3,4)
hist(alt_samples, 20, normed=True);
axvline(thresh_boot, 0, 1, linewidth=2, color='red')
type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))
grid(True)

# compute range for all null distribution histograms
hist_range=[min([min(null_samples_boot), min(null_samples_gaussian)]), max([max(null_samples_boot), max(null_samples_gaussian)])]

# plot null distribution with threshold
subplot(2,3,3)
hist(null_samples_boot, 20, range=hist_range, normed=True);
axvline(thresh_boot, 0, 1, linewidth=2, color='red')
title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))
grid(True)

# plot null distribution gaussian
subplot(2,3,5)
hist(null_samples_gaussian, 20, range=hist_range, normed=True);
axvline(thresh_gaussian, 0, 1, linewidth=2, color='red')
title('Null Dist. Gaussian\nType I error is ' + str(type_one_error_gaussian))
grid(True)

# pull plots a bit apart
subplots_adjust(hspace=0.5)
subplots_adjust(wspace=0.5)
show()
@@ -19,10 +19,12 @@
# parameters, change to get different results
m=100
dim=2

# setting the difference of the first dimension smaller makes a harder test
difference=0.5

# number of samples taken from null and alternative distribution
-num_bootstrap=500
+num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_mean_data(m,dim,difference)
@@ -40,7 +42,7 @@
mmd.set_statistic_type(BIASED)

# sample alternative distribution
-alt_samples=zeros(num_bootstrap)
+alt_samples=zeros(num_null_samples)
for i in range(len(alt_samples)):
	data=DataGenerator.generate_mean_data(m,dim,difference)
	features.set_feature_matrix(data)
@@ -50,22 +52,22 @@
# bootstrapping, biased statistic
mmd.set_null_approximation_method(BOOTSTRAP)
mmd.set_statistic_type(BIASED)
-mmd.set_bootstrap_iterations(num_bootstrap)
+mmd.set_bootstrap_iterations(num_null_samples)
null_samples_boot=mmd.bootstrap_null()

# sample from null distribution
# spectrum, biased statistic
if "sample_null_spectrum" in dir(QuadraticTimeMMD):
	mmd.set_null_approximation_method(MMD2_SPECTRUM)
	mmd.set_statistic_type(BIASED)
-	null_samples_spectrum=mmd.sample_null_spectrum(num_bootstrap, m-10)
+	null_samples_spectrum=mmd.sample_null_spectrum(num_null_samples, m-10)

# fit gamma distribution, biased statistic
mmd.set_null_approximation_method(MMD2_GAMMA)
mmd.set_statistic_type(BIASED)
gamma_params=mmd.fit_null_gamma()
# sample gamma with parameters
-null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_bootstrap)])
+null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)])


# plot
@@ -77,7 +79,7 @@
plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
plot(data[0][m+1:2*m], data[1][m+1:2*m], 'bo', label='$x$')
legend()
-title('Data')
+title('Data, shift in $x_1$='+str(difference)+'\nm='+str(m))
xlabel('$x_1, y_1$')
ylabel('$x_2, y_2$')
grid(True)
@@ -89,7 +91,7 @@
xs=linspace(min(data[0])-1,max(data[0])+1, 50)
plot(xs,normpdf( xs, 0, 1), 'r', linewidth=3)
plot(xs,normpdf( xs, difference, 1), 'b', linewidth=3)
-title('Data: $x_1, y_1$')
+title('Data PDF in $x_1$')
grid(True)

# compute threshold for test level
@@ -101,15 +103,15 @@
thresh_spectrum=null_samples_spectrum[floor(len(null_samples_spectrum)*(1-alpha))];
thresh_gamma=null_samples_gamma[floor(len(null_samples_gamma)*(1-alpha))];

-type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_bootstrap)
-type_one_error_spectrum=sum(null_samples_spectrum<thresh_boot)/float(num_bootstrap)
-type_one_error_gamma=sum(null_samples_gamma<thresh_boot)/float(num_bootstrap)
+type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples)
+type_one_error_spectrum=sum(null_samples_spectrum<thresh_boot)/float(num_null_samples)
+type_one_error_gamma=sum(null_samples_gamma<thresh_boot)/float(num_null_samples)

# plot alternative distribution with threshold
subplot(2,3,4)
hist(alt_samples, 20, normed=True);
axvline(thresh_boot, 0, 1, linewidth=2, color='red')
-type_two_error=sum(alt_samples<thresh_boot)/float(num_bootstrap)
+type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))
grid(True)

7 changes: 3 additions & 4 deletions src/shogun/statistics/HSIC.cpp
@@ -68,9 +68,6 @@ float64_t CHSIC::compute_statistic()
	SGMatrix<float64_t> K=get_kernel_matrix_K();
	SGMatrix<float64_t> L=get_kernel_matrix_L();

-	/* init kernel afterwards to ensure that precomputed stuff is updated */
-	m_kernel_q->init(m_p_and_q, m_p_and_q);

	/* center matrices (MATLAB: Kc=H*K*H) */
	K.center();

@@ -230,6 +227,7 @@ SGMatrix<float64_t> CHSIC::get_kernel_matrix_K()
	/* distinguish between custom and normal kernels */
	if (m_kernel_p->get_kernel_type()==K_CUSTOM)
	{
+		/* custom kernels need to be initialised when a subset is added */
		CCustomKernel* custom_kernel_p=(CCustomKernel*)m_kernel_p;
		custom_kernel_p->add_row_subset(subset);
		custom_kernel_p->add_col_subset(subset);
@@ -263,8 +261,9 @@ SGMatrix<float64_t> CHSIC::get_kernel_matrix_L()
	subset.add(m_q_start);

	/* now second half of data for L */
-	if (m_kernel_p->get_kernel_type()==K_CUSTOM)
+	if (m_kernel_q->get_kernel_type()==K_CUSTOM)
	{
+		/* custom kernels need to be initialised when a subset is added */
		CCustomKernel* custom_kernel_q=(CCustomKernel*)m_kernel_q;
		custom_kernel_q->add_row_subset(subset);
		custom_kernel_q->add_col_subset(subset);
2 changes: 1 addition & 1 deletion src/shogun/statistics/QuadraticTimeMMD.cpp
@@ -274,7 +274,7 @@ SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples,
	/* possibly warn user not to use wrong statistic type */
	if (m_statistic_type!=BIASED)
	{
-		SG_WARNING("%s::compute_p_value(): Note: provided statistic has "
+		SG_WARNING("%s::sample_null_spectrum(): Note: provided statistic has "
				"to be BIASED. Please ensure that! To get rid of warning,"
				"call %s::set_statistic_type(BIASED)\n", get_name(),
				get_name());
Expand Down
