Skip to content

Commit

Permalink
Merge pull request #610 from karlnapf/master
Browse files Browse the repository at this point in the history
fixes, documentation and example
  • Loading branch information
karlnapf committed Jun 29, 2012
2 parents 047feef + 680f6ca commit ccf6742
Show file tree
Hide file tree
Showing 14 changed files with 130 additions and 162 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -88,5 +88,6 @@ configure.log
/examples/undocumented/libshogun/*
!/examples/undocumented/libshogun/*.cpp
examples/undocumented/python_modular/*
!/examples/undocumented/libshogun/tools/
!examples/undocumented/python_modular/graphical/
!examples/undocumented/python_modular/*.py
77 changes: 77 additions & 0 deletions examples/undocumented/python_modular/statistics_linear_time_mmd.py
@@ -0,0 +1,77 @@
#
# This program is free software you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012 Heiko Strathmann
#
from numpy import *
from tools.two_distributions_data import TwoDistributionsData

gen_data=TwoDistributionsData()

def statistics_linear_time_mmd():
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Statistics import LinearTimeMMD, QuadraticTimeMMD
from shogun.Statistics import BOOTSTRAP

import matplotlib.pyplot as plt

# note that the linear time statistic is designed for much larger datasets
n=10000
dim=2
difference=0.5

# data is standard normal distributed. only one dimension of Y has a mean
# shift of difference
# in pratice, this generate data function could be replaced by a method
# that obtains data from a stream
(X,Y)=gen_data.create_mean_data(n,dim,difference)

print "dimension means of X", [mean(x) for x in X]
print "dimension means of Y", [mean(x) for x in Y]

# create shogun feature representation
features_x=RealFeatures(X)
features_y=RealFeatures(Y)

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in constrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,0.125)

mmd=LinearTimeMMD(kernel,features_x, features_y)

# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05
# for the linear time mmd, the statistic has to be computed on different
# data than the p-value, so first, compute statistic, and then compute
# p-value on other data
# this annoying property is since the null-distribution should stay normal
# which is not the case if "training/test" data would be the same
statistic=mmd.compute_statistic()

# generate new data (same distributions as old) and new statistic object
(X,Y)=gen_data.create_mean_data(n,dim,difference)
features_x=RealFeatures(X)
features_y=RealFeatures(Y)
mmd=LinearTimeMMD(kernel,features_x, features_y)

p_value=mmd.compute_p_value(statistic)
alpha=0.05
print "p_value:", p_value
print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

# sample from null distribution (these may be plotted or whatsoever)
# mean should be close to zero, variance stronly depends on data/kernel
mmd.set_null_approximation_method(BOOTSTRAP)
mmd.set_bootstrap_iterations(100)
null_samples=mmd.bootstrap_null()
print "null mean:", mean(null_samples)
print "null variance:", var(null_samples)

if __name__=='__main__':
print('LinearTimeMMD')
statistics_linear_time_mmd()
@@ -0,0 +1,20 @@
#!/usr/bin/env python

from numpy import *

# class for creating pairs of sample sets from two different distribution
class TwoDistributionsData:
def __init__(self):
pass

# creates to sample sets of desired shape which is standard normal distrbuted
# the first dimension of Y has a mean shift of difference
def create_mean_data(self, n, dim, difference):
X=random.randn(n,dim)
Y=random.randn(n,dim)

for i in range(len(Y)):
Y[i][0]+=difference

# return in shogun format: vectors are columns
return (transpose(X),transpose(Y))
2 changes: 0 additions & 2 deletions src/interfaces/modular/Statistics.i
Expand Up @@ -8,15 +8,13 @@
*/

/* Remove C Prefix */
%rename(StatisticalTest) CStatisticalTest;
%rename(TestStatistic) CTestStatistic;
%rename(TwoSampleTestStatistic) CTwoSampleTestStatistic;
%rename(KernelTwoSampleTestStatistic) CKernelTwoSampleTestStatistic;
%rename(LinearTimeMMD) CLinearTimeMMD;
%rename(QuadraticTimeMMD) CQuadraticTimeMMD;

/* Include Class Headers to make them visible from within the target language */
%include <shogun/statistics/StatisticalTest.h>
%include <shogun/statistics/TestStatistic.h>
%include <shogun/statistics/TwoSampleTestStatistic.h>
%include <shogun/statistics/KernelTwoSampleTestStatistic.h>
Expand Down
1 change: 0 additions & 1 deletion src/interfaces/modular/Statistics_includes.i
@@ -1,5 +1,4 @@
%{
#include <shogun/statistics/StatisticalTest.h>
#include <shogun/statistics/TestStatistic.h>
#include <shogun/statistics/TwoSampleTestStatistic.h>
#include <shogun/statistics/KernelTwoSampleTestStatistic.h>
Expand Down
2 changes: 1 addition & 1 deletion src/shogun/features/Features.h
Expand Up @@ -231,7 +231,7 @@ class CFeatures : public CSGObject
* @return new feature object which contains copy of data of this
* instance and of given one
*/
CFeatures* create_merged_copy(CFeatures* other)
virtual CFeatures* create_merged_copy(CFeatures* other)
{
SG_ERROR("%s::create_merged_copy() is not yet implemented!\n");
return NULL;
Expand Down
8 changes: 7 additions & 1 deletion src/shogun/statistics/LinearTimeMMD.cpp
Expand Up @@ -56,13 +56,16 @@ void CLinearTimeMMD::init()

float64_t CLinearTimeMMD::compute_statistic()
{
SG_DEBUG("entering CLinearTimeMMD::compute_statistic()\n");
/* TODO features with a different number of vectors should be allowed */

/* m is number of samples from each distribution, m_2 is half of it
* using names from JLMR paper (see class documentation) */
index_t m=m_q_start;
index_t m_2=m/2;

SG_DEBUG("m_q_start=%d\n", m_q_start);

/* compute traces of kernel matrices for linear MMD */
m_kernel->init(m_p_and_q, m_p_and_q);

Expand All @@ -80,15 +83,18 @@ float64_t CLinearTimeMMD::compute_statistic()
qp+=m_kernel->kernel(m_2+i, m+i);
}

SG_DEBUG("returning: 1/%d*(%f+%f-%f-%f)\n", m_2, pp, qq, pq, qp);

/* mean of sum all traces is linear time mmd */
SG_DEBUG("leaving CLinearTimeMMD::compute_statistic()\n");
return 1.0/m_2*(pp+qq-pq-qp);
}

float64_t CLinearTimeMMD::compute_p_value(float64_t statistic)
{
float64_t result=0;

switch (m_p_value_method)
switch (m_null_approximation_method)
{
case MMD1_GAUSSIAN:
if (m_p_and_q->get_num_vectors()<10000)
Expand Down
7 changes: 6 additions & 1 deletion src/shogun/statistics/LinearTimeMMD.h
Expand Up @@ -27,7 +27,12 @@ class CFeatures;
* Gaussian approximation of the null-distribution which is also possible in
* linear time and constant space. Bootstrapping, of course, is also possible.
*
* To choose, use set_p_value_method(MMD1_GAUSSIAN). *
* To choose, use
* CTwoSampleTestStatistic::set_null_approximation_method(MMD1_GAUSSIAN).
*
* IMPORTANT: In order to use the gaussian approximation, the p-value has to
* be computed on other data than the statistic. Otherwise the null-distribution
* is not normal.
*/
class CLinearTimeMMD: public CKernelTwoSampleTestStatistic
{
Expand Down
2 changes: 1 addition & 1 deletion src/shogun/statistics/QuadraticTimeMMD.cpp
Expand Up @@ -167,7 +167,7 @@ float64_t CQuadraticTimeMMD::compute_p_value(float64_t statistic)
{
float64_t result=0;

switch (m_p_value_method)
switch (m_null_approximation_method)
{
case MMD2_SPECTRUM:
{
Expand Down
2 changes: 1 addition & 1 deletion src/shogun/statistics/QuadraticTimeMMD.h
Expand Up @@ -53,7 +53,7 @@ enum EQuadraticMMDType
* Gretton, A., Fukumizu, K., & Harchaoui, Z. (2011).
* A fast, consistent kernel two-sample test.
*
* To choose, use CTwoSampleTestStatistic::set_p_value_method()
* To choose, use CTwoSampleTestStatistic::set_null_approximation_method()
*
*/
class CQuadraticTimeMMD : public CKernelTwoSampleTestStatistic
Expand Down
76 changes: 0 additions & 76 deletions src/shogun/statistics/StatisticalTest.cpp

This file was deleted.

64 changes: 0 additions & 64 deletions src/shogun/statistics/StatisticalTest.h

This file was deleted.

15 changes: 9 additions & 6 deletions src/shogun/statistics/TwoSampleTestStatistic.cpp
Expand Up @@ -52,18 +52,21 @@ void CTwoSampleTestStatistic::init()
MS_NOT_AVAILABLE);
SG_ADD(&m_bootstrap_iterations, "bootstrap_iterations",
"Number of iterations for bootstrapping", MS_NOT_AVAILABLE);
SG_ADD((machine_int_t*)&m_p_value_method, "p_value_method",
"Method for computing p-value", MS_NOT_AVAILABLE);
SG_ADD((machine_int_t*)&m_null_approximation_method,
"null_approximation_method",
"Method for approximating null distribution",
MS_NOT_AVAILABLE);

m_p_and_q=NULL;
m_q_start=0;
m_bootstrap_iterations=250;
m_p_value_method=BOOTSTRAP;
m_null_approximation_method=BOOTSTRAP;
}

void CTwoSampleTestStatistic::set_p_value_method(EPValueMethod p_value_method)
void CTwoSampleTestStatistic::set_null_approximation_method(
ENullApproximationMethod null_approximation_method)
{
m_p_value_method=p_value_method;
m_null_approximation_method=null_approximation_method;
}

SGVector<float64_t> CTwoSampleTestStatistic::bootstrap_null()
Expand Down Expand Up @@ -105,7 +108,7 @@ float64_t CTwoSampleTestStatistic::compute_p_value(float64_t statistic)
{
float64_t result=0;

if (m_p_value_method==BOOTSTRAP)
if (m_null_approximation_method==BOOTSTRAP)
{
/* bootstrap a bunch of MMD values from null distribution */
SGVector<float64_t> values=bootstrap_null();
Expand Down

0 comments on commit ccf6742

Please sign in to comment.