Merge pull request #613 from karlnapf/master
a new python example and some fixes
karlnapf committed Jun 29, 2012
2 parents db38a5d + 1de3fce commit e9b94ce
Showing 5 changed files with 129 additions and 5 deletions.
@@ -191,6 +191,12 @@ void test_linear_mmd_type2_error()
for (index_t i=0; i<num_runs; ++i)
{
create_mean_data(data, difference);

/* technically, this leads to a wrong result since training (statistic)
 * and testing (p-value) have to happen on different data, but this
 * is only to compare against MATLAB, where I made the same "mistake".
 * See the python_modular example for how to do this correctly.
 * Note that this only matters when using the Gaussian approximation */
float64_t statistic=mmd->compute_statistic();

float64_t p_value_est=mmd->compute_p_value(statistic);
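As the comment above notes, with the Gaussian approximation the statistic ("training") and the p-value ("testing") should be computed on different samples. A minimal sketch of that idea, assuming the python_modular API used by the examples in this commit (LinearTimeMMD, MMD1_GAUSSIAN, RealFeatures, GaussianKernel); the inline data generation is illustrative only:

from numpy import random
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Statistics import LinearTimeMMD, MMD1_GAUSSIAN

def gaussian_p_value_on_fresh_data(n, dim, difference):
	# "training" data: used only to compute the statistic
	X=random.randn(dim,n)
	Y=random.randn(dim,n)
	Y[0,:]+=difference
	kernel=GaussianKernel(10,8)
	mmd=LinearTimeMMD(kernel, RealFeatures(X), RealFeatures(Y))
	statistic=mmd.compute_statistic()

	# "test" data: an independent sample from the same distributions,
	# used only to compute the p-value of the statistic above
	X2=random.randn(dim,n)
	Y2=random.randn(dim,n)
	Y2[0,:]+=difference
	mmd2=LinearTimeMMD(kernel, RealFeatures(X2), RealFeatures(Y2))
	mmd2.set_null_approximation_method(MMD1_GAUSSIAN)
	return mmd2.compute_p_value(statistic)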
16 changes: 14 additions & 2 deletions examples/undocumented/python_modular/statistics_linear_time_mmd.py
@@ -15,7 +15,7 @@ def statistics_linear_time_mmd():
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Statistics import LinearTimeMMD
from shogun.Statistics import BOOTSTRAP
from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN

import matplotlib.pyplot as plt

@@ -60,8 +60,20 @@ def statistics_linear_time_mmd():
features_y=RealFeatures(Y)
mmd=LinearTimeMMD(kernel,features_x, features_y)

p_value=mmd.compute_p_value(statistic)
# do the same thing using two different ways to approximate the null-distribution:
# bootstrapping and the gaussian approximation (only for really large samples)
alpha=0.05

print "computing p-value using bootstrapping"
mmd.set_null_approximation_method(BOOTSTRAP)
mmd.set_bootstrap_iterations(500)
p_value=mmd.compute_p_value(statistic)
print "p_value:", p_value
print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

print "computing p-value using gaussian approximation"
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
p_value=mmd.compute_p_value(statistic)
print "p_value:", p_value
print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

105 changes: 105 additions & 0 deletions examples/undocumented/python_modular/statistics_quadratic_time_mmd.py
@@ -0,0 +1,105 @@
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012 Heiko Strathmann
#
from numpy import *
from tools.two_distributions_data import TwoDistributionsData

gen_data=TwoDistributionsData()

def statistics_quadratic_time_mmd():
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Statistics import QuadraticTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED

import matplotlib.pyplot as plt

# note that the quadratic time mmd sometimes has to store full kernel matrices,
# which massively upper-bounds the feasible sample size
n=500
dim=2
difference=0.5

# data is standard normally distributed; only one dimension of Y has a mean
# shift of difference
(X,Y)=gen_data.create_mean_data(n,dim,difference)

print "dimension means of X", [mean(x) for x in X]
print "dimension means of Y", [mean(x) for x in Y]

# create shogun feature representation
features_x=RealFeatures(X)
features_y=RealFeatures(Y)

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization,
# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)
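# (illustrative check of the conversion above, not part of the original
# example: for sigma=2, tau=2*sigma**2=8, which is the width passed to
# GaussianKernel)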

mmd=QuadraticTimeMMD(kernel,features_x, features_y)

# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05 using different methods to approximate
# null-distribution
statistic=mmd.compute_statistic()
alpha=0.05

print "computing p-value using bootstrapping"
mmd.set_null_approximation_method(BOOTSTRAP)
# normally, at least 250 iterations should be done, but that takes a long time
mmd.set_bootstrap_iterations(10)
# bootstrapping allows usage of unbiased or biased statistic
mmd.set_statistic_type(UNBIASED)
p_value=mmd.compute_p_value(statistic)
print "p_value:", p_value
print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

print "computing p-value using spectrum method"
mmd.set_null_approximation_method(MMD2_SPECTRUM)
# normally, at least 250 null samples should be drawn, but that takes a long time
mmd.set_num_samples_sepctrum(50)
mmd.set_num_eigenvalues_spectrum(n-10)
# spectrum method computes p-value for biased statistics only
mmd.set_statistic_type(BIASED)
p_value=mmd.compute_p_value(statistic)
print "p_value:", p_value
print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

print "computing p-value using gamma method"
mmd.set_null_approximation_method(MMD2_GAMMA)
# gamma method computes p-value for biased statistics only
mmd.set_statistic_type(BIASED)
p_value=mmd.compute_p_value(statistic)
print "p_value:", p_value
print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

# sample from the null distribution (these may be plotted, for example)
# the mean should be close to zero, the variance strongly depends on data/kernel
# bootstrapping, biased statistic
print "sampling null distribution using bootstrapping"
mmd.set_null_approximation_method(BOOTSTRAP)
mmd.set_statistic_type(BIASED)
mmd.set_bootstrap_iterations(10)
null_samples=mmd.bootstrap_null()
print "null mean:", mean(null_samples)
print "null variance:", var(null_samples)

# sample from the null distribution (these may be plotted, for example)
# the mean should be close to zero, the variance strongly depends on data/kernel
# spectrum, biased statistic
print "sampling null distribution using spectrum method"
mmd.set_null_approximation_method(MMD2_SPECTRUM)
mmd.set_statistic_type(BIASED)
# 200 samples using 100 eigenvalues
null_samples=mmd.sample_null_spectrum(200,100)
print "null mean:", mean(null_samples)
print "null variance:", var(null_samples)

if __name__=='__main__':
print('QuadraticTimeMMD')
statistics_quadratic_time_mmd()
3 changes: 2 additions & 1 deletion src/shogun/statistics/LinearTimeMMD.h
@@ -76,7 +76,8 @@ class CLinearTimeMMD: public CKernelTwoSampleTestStatistic
/** Computes the p-value for a given statistic. The method for computing
* the p-value can be set via set_p_value_method(). Since the null-
* distribution is normal, a Gaussian approximation is available along with
* bootstrapping
* bootstrapping. For Gaussian approximation, training and test data have
* to be DIFFERENT samples from the same distribution
*
* @param statistic statistic to compute the p-value for
*
4 changes: 2 additions & 2 deletions src/shogun/statistics/QuadraticTimeMMD.h
@@ -159,15 +159,15 @@ class CQuadraticTimeMMD : public CKernelTwoSampleTestStatistic
float64_t compute_p_value_gamma(float64_t statistic);

/** setter for number of samples to use in spectrum based p-value
* computation
* computation.
*
* @param num_samples_spectrum number of samples to draw from
* approximate null-distribution
*/
void set_num_samples_sepctrum(index_t num_samples_spectrum);

/** setter for number of eigenvalues to use in spectrum based p-value
* computation
* computation. Maximum is 2*m_q_start-1
*
* @param num_eigenvalues_spectrum number of eigenvalues to use to
* approximate null-distribution
