Skip to content

Commit

Permalink
Merge pull request #674 from karlnapf/master
Browse files Browse the repository at this point in the history
refactoring and bootstrapping for independence tests
  • Loading branch information
karlnapf committed Jul 25, 2012
2 parents 8e7d14c + fac0f66 commit f3b1c98
Show file tree
Hide file tree
Showing 15 changed files with 163 additions and 213 deletions.
Expand Up @@ -8,15 +8,15 @@
#
from numpy import *

def statistics_linear_time_mmd():
def statistics_quadratic_time_mmd():
from shogun.Features import RealFeatures
from shogun.Features import DataGenerator
from shogun.Kernel import GaussianKernel
from shogun.Statistics import QuadraticTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED

# note that the quadratic time mmd has sometimes to store kernel matrices
# which upper bounds the sample size massively
# note that the quadratic time mmd has to store kernel matrices
# which upper bounds the sample size
n=500
dim=2
difference=0.5
Expand Down Expand Up @@ -97,5 +97,5 @@ def statistics_linear_time_mmd():
print "null variance:", var(null_samples)

if __name__=='__main__':
print('LinearTimeMMD')
statistics_linear_time_mmd()
print('QuadraticTimeMMD')
statistics_quadratic_time_mmd()
6 changes: 2 additions & 4 deletions src/interfaces/modular/Statistics.i
Expand Up @@ -9,20 +9,18 @@

/* Remove C Prefix */
%rename(TestStatistic) CTestStatistic;
%rename(TwoSampleTestStatistic) CTwoSampleTestStatistic;
%rename(TwoDistributionsTestStatistic) CTwoDistributionsTestStatistic;
%rename(KernelTwoSampleTestStatistic) CKernelTwoSampleTestStatistic;
%rename(LinearTimeMMD) CLinearTimeMMD;
%rename(QuadraticTimeMMD) CQuadraticTimeMMD;
%rename(IndependenceTestStatistic) CIndependenceTestStatistic;
%rename(KernelIndependenceTestStatistic) CKernelIndependenceTestStatistic;
%rename(HSIC) CHSIC;

/* Include Class Headers to make them visible from within the target language */
%include <shogun/statistics/TestStatistic.h>
%include <shogun/statistics/TwoSampleTestStatistic.h>
%include <shogun/statistics/TwoDistributionsTestStatistic.h>
%include <shogun/statistics/KernelTwoSampleTestStatistic.h>
%include <shogun/statistics/LinearTimeMMD.h>
%include <shogun/statistics/QuadraticTimeMMD.h>
%include <shogun/statistics/IndependenceTestStatistic.h>
%include <shogun/statistics/KernelIndependenceTestStatistic.h>
%include <shogun/statistics/HSIC.h>
3 changes: 1 addition & 2 deletions src/interfaces/modular/Statistics_includes.i
@@ -1,10 +1,9 @@
%{
#include <shogun/statistics/TestStatistic.h>
#include <shogun/statistics/TwoSampleTestStatistic.h>
#include <shogun/statistics/TwoDistributionsTestStatistic.h>
#include <shogun/statistics/KernelTwoSampleTestStatistic.h>
#include <shogun/statistics/LinearTimeMMD.h>
#include <shogun/statistics/QuadraticTimeMMD.h>
#include <shogun/statistics/IndependenceTestStatistic.h>
#include <shogun/statistics/KernelIndependenceTestStatistic.h>
#include <shogun/statistics/HSIC.h>
%}
Expand Down
8 changes: 8 additions & 0 deletions src/shogun/lib/SGVector.cpp
Expand Up @@ -111,6 +111,14 @@ template<class T> void SGVector<T>::add(const SGVector<T> x)
vector[i]+=x.vector[i];
}

template<class T> void SGVector<T>::add(const T x)
{
ASSERT(vector);

for (int32_t i=0; i<vlen; i++)
vector[i]+=x;
}

template<class T> void SGVector<T>::display_size() const
{
SG_SPRINT("SGVector '%p' of size: %d\n", vector, vlen);
Expand Down
6 changes: 6 additions & 0 deletions src/shogun/lib/SGVector.h
Expand Up @@ -144,6 +144,12 @@ template<class T> class SGVector : public SGReferencedData
*/
void add(const SGVector<T> x);

/** add scalar to current vector
*
* @param x add vector x to current vector
*/
void add(const T x);

SGVector<T> operator+ (SGVector<T> x)
{
ASSERT(x.vector && vector);
Expand Down
52 changes: 40 additions & 12 deletions src/shogun/statistics/HSIC.cpp
Expand Up @@ -20,7 +20,21 @@ CHSIC::CHSIC() :
init();
}

CHSIC::CHSIC(CKernel* kernel_p, CKernel* kernel_q, CFeatures* p, CFeatures* q) :
CHSIC::CHSIC(CKernel* kernel_p, CKernel* kernel_q, CFeatures* p_and_q,
index_t q_start) :
CKernelIndependenceTestStatistic(kernel_p, kernel_q, m_p_and_q, q_start)
{
if (p_and_q && p_and_q->get_num_vectors()/2!=q_start)
{
SG_ERROR("%s: Only features with equal number of vectors are currently "
"possible\n", get_name());
}

init();
}

CHSIC::CHSIC(CKernel* kernel_p, CKernel* kernel_q, CFeatures* p,
CFeatures* q) :
CKernelIndependenceTestStatistic(kernel_p, kernel_q, p, q)
{
if (p && q && p->get_num_vectors()!=q->get_num_vectors())
Expand Down Expand Up @@ -50,18 +64,25 @@ float64_t CHSIC::compute_statistic()
get_name());
}

/* compute kernel matrices (these have to be stored unfortunately) */
m_kernel_p->init(m_p, m_p);
m_kernel_q->init(m_q, m_q);

/* compute kernel matrices make sure that if one kernel is used, still
* everything works, so compute one after another, using subsets */
SGVector<index_t> subset(m_q_start);
subset.range_fill();
m_p_and_q->add_subset(subset);
m_kernel_p->init(m_p_and_q, m_p_and_q);
SGMatrix<float64_t> K=m_kernel_p->get_kernel_matrix();

/* now second half of data subsetting */
subset.add(m_q_start);
m_kernel_q->init(m_p_and_q, m_p_and_q);
SGMatrix<float64_t> L=m_kernel_q->get_kernel_matrix();
m_p_and_q->remove_subset();

/* center matrices (MATLAB: Kc=H*K*H) */
K.center();

/* compute MATLAB: sum(sum(Kc' .* (L))), which is biased HSIC */
index_t m=K.num_rows;
index_t m=m_q_start;
float64_t result=0;
for (index_t i=0; i<m; ++i)
{
Expand All @@ -85,7 +106,7 @@ float64_t CHSIC::compute_p_value(float64_t statistic)
break;

default:
result=CIndependenceTestStatistic::compute_p_value(statistic);
result=CTwoDistributionsTestStatistic::compute_p_value(statistic);
break;
}

Expand All @@ -106,14 +127,21 @@ float64_t CHSIC::compute_p_value_gamma(float64_t statistic)
get_name());
}

index_t m=m_p->get_num_vectors();

/* compute kernel matrices (these have to be stored unfortunately) */
m_kernel_p->init(m_p, m_p);
m_kernel_q->init(m_q, m_q);
index_t m=m_q_start;

/* compute kernel matrices make sure that if one kernel is used, still
* everything works, so compute one after another, using subsets */
SGVector<index_t> subset(m_q_start);
subset.range_fill();
m_p_and_q->add_subset(subset);
m_kernel_p->init(m_p_and_q, m_p_and_q);
SGMatrix<float64_t> K=m_kernel_p->get_kernel_matrix();

/* now second half of data subsetting */
subset.add(m_q_start);
m_kernel_q->init(m_p_and_q, m_p_and_q);
SGMatrix<float64_t> L=m_kernel_q->get_kernel_matrix();
m_p_and_q->remove_subset();

/* compute sum and trace of uncentered kernel matrices, needed later */
float64_t trace_K=0;
Expand Down
27 changes: 23 additions & 4 deletions src/shogun/statistics/HSIC.h
Expand Up @@ -24,12 +24,31 @@ class CHSIC : public CKernelIndependenceTestStatistic
/** TODO */
CHSIC();

/** Constructor
*
* @param p_and_q feature data. Is assumed to contain samples from both
* p and q. First all samples from p, then from index q_start all
* samples from q
*
* @param kernel_p kernel to use on samples from p
* @param kernel_q kernel to use on samples from q
* @param p_and_q samples from p and q, appended
* @param q_start index of first sample of q
*/
CHSIC(CKernel* kernel_p, CKernel* kernel_q, CFeatures* p_and_q,
index_t q_start);

/** Constructor.
* This is a convienience constructor which copies both features to one
* element and then calls the other constructor. Needs twice the memory
* for a short time
*
* @param kernel_p kernel samples from p
* @param kernel_q kernel samples from q
* @param p samples from p
* @param q samples from q
* @param kernel_p kernel to use on samples from p
* @param kernel_q kernel to use on samples from q
* @param p samples from distribution p, will be copied and NOT
* SG_REF'ed
* @param q samples from distribution q, will be copied and NOT
* SG_REF'ed
*/
CHSIC(CKernel* kernel_p, CKernel* kernel_q, CFeatures* p, CFeatures* q);

Expand Down
81 changes: 0 additions & 81 deletions src/shogun/statistics/IndependenceTestStatistic.cpp

This file was deleted.

67 changes: 0 additions & 67 deletions src/shogun/statistics/IndependenceTestStatistic.h

This file was deleted.

0 comments on commit f3b1c98

Please sign in to comment.