Skip to content

Commit

Permalink
Merge pull request #622 from karlnapf/master
Browse files Browse the repository at this point in the history
fixes for HSIC and first test/example
  • Loading branch information
karlnapf committed Jul 4, 2012
2 parents caa554c + 5bcd69f commit 2c53078
Show file tree
Hide file tree
Showing 7 changed files with 99 additions and 20 deletions.
1 change: 1 addition & 0 deletions examples/undocumented/libshogun/Makefile
Expand Up @@ -85,6 +85,7 @@ TARGETS = basic_minimal \
transfer_multitasklogisticregression \
statistics_quadratic_time_mmd \
statistics_linear_time_mmd \
statistics_hsic \

all: $(TARGETS)

Expand Down
87 changes: 87 additions & 0 deletions examples/undocumented/libshogun/statistics_hsic.cpp
@@ -0,0 +1,87 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2012 Heiko Strathmann
*/

#include <shogun/base/init.h>
#include <shogun/statistics/HSIC.h>
#include <shogun/kernel/GaussianKernel.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/mathematics/Statistics.h>

using namespace shogun;


void create_mean_data(SGMatrix<float64_t> target, float64_t difference)
{
/* create data matrix for P and Q. P is a standard normal, Q is the same but
* has a mean difference in one dimension */
for (index_t i=0; i<target.num_rows; ++i)
{
for (index_t j=0; j<target.num_cols/2; ++j)
target(i,j)=CMath::randn_double();

/* add mean difference in first dimension of second half of data */
for (index_t j=target.num_cols/2; j<target.num_cols; ++j)
target(i,j)=CMath::randn_double() + (i==0 ? difference : 0);
}
}

/** tests the hsic statistic for a single fixed data case and ensures
* equality with matlab implementation */
void test_hsic_fixed()
{
index_t m=2;
index_t d=3;
float64_t sigma_x=2;
float64_t sq_sigma_x_twice=sigma_x*sigma_x*2;
float64_t sigma_y=3;
float64_t sq_sigma_y_twice=sigma_y*sigma_y*2;

SGMatrix<float64_t> p(d,2*m);
for (index_t i=0; i<2*d*m; ++i)
p.matrix[i]=i;

// p.display_matrix("p");

SGMatrix<float64_t> q(d,2*m);
for (index_t i=0; i<2*d*m; ++i)
q.matrix[i]=i+10;

// q.display_matrix("q");

CDenseFeatures<float64_t>* features_p=new CDenseFeatures<float64_t>(p);
CDenseFeatures<float64_t>* features_q=new CDenseFeatures<float64_t>(q);

/* shoguns kernel width is different */
CGaussianKernel* kernel_p=new CGaussianKernel(10, sq_sigma_x_twice);
CGaussianKernel* kernel_q=new CGaussianKernel(10, sq_sigma_y_twice);

CHSIC* hsic=new CHSIC(kernel_p, kernel_q, features_p, features_q);

/* assert matlab result */
float64_t difference=hsic->compute_statistic();
SG_SPRINT("hsic fixed: %f\n", difference);
ASSERT(CMath::abs(difference-0.164761446385339)<10E-17);

SG_UNREF(hsic);
}

int main(int argc, char** argv)
{
init_shogun_with_defaults();

/* all tests have been "speed up" by reducing the number of runs/samples.
* If you have any doubts in the results, set all num_runs to original
* numbers and activate asserts. If they fail, something is wrong.
*/
test_hsic_fixed();

exit_shogun();
return 0;
}

1 change: 1 addition & 0 deletions src/shogun/lib/SGMatrix.cpp
Expand Up @@ -11,6 +11,7 @@ template <class T>
void SGMatrix<T>::transpose_matrix(
T*& matrix, int32_t& num_feat, int32_t& num_vec)
{
/* this should be done in-place! Heiko */
T* transposed=SG_MALLOC(T, num_vec*num_feat);
for (int32_t i=0; i<num_vec; i++)
{
Expand Down
1 change: 1 addition & 0 deletions src/shogun/lib/SGMatrix.h
Expand Up @@ -243,6 +243,7 @@ template<class T> class SGMatrix : public SGReferencedData
return colsums;
}

/** Centers the matrix, i.e. removes column/row mean from columns/rows */
void center()
{
center_matrix(matrix, num_rows, num_cols);
Expand Down
9 changes: 4 additions & 5 deletions src/shogun/statistics/HSIC.cpp
Expand Up @@ -44,7 +44,7 @@ void CHSIC::init()

float64_t CHSIC::compute_statistic()
{
if (!m_kernel_p || m_kernel_q)
if (!m_kernel_p || !m_kernel_q)
{
SG_ERROR("%s::compute_statistic(): No or only one kernel specified!\n",
get_name());
Expand All @@ -55,13 +55,12 @@ float64_t CHSIC::compute_statistic()
m_kernel_q->init(m_q, m_q);

SGMatrix<float64_t> K=m_kernel_p->get_kernel_matrix();
SGMatrix<float64_t> L=m_kernel_p->get_kernel_matrix();
SGMatrix<float64_t> L=m_kernel_q->get_kernel_matrix();

/* center matrices (replaces this H matrix from the paper) */
/* center matrices (MATLAB: Kc=H*K*H) */
K.center();
L.center();

/* compute MATLAB: sum(sum((H*K)' .* (H*L))), which is biased HSIC */
/* compute MATLAB: sum(sum(Kc' .* (L))), which is biased HSIC */
index_t m=K.num_rows;
float64_t result=0;
for (index_t i=0; i<m; ++i)
Expand Down
16 changes: 5 additions & 11 deletions src/shogun/statistics/IndependenceTestStatistic.h
Expand Up @@ -16,19 +16,13 @@ namespace shogun
{

class CFeatures;
/** TODO
*
* @brief Test statistic base class. Provides an interface for statistical
* tests via three methods: compute_statistic(), compute_p_value() and
* compute_threshold(). The second computes a p-value for the statistic computed
* by the first method.
* The p-value represents the position of the statistic in the null-distribution,
* i.e. the distribution of the statistic population given the null-hypothesis
* is true. (1-position = p-value).
* The third method, compute_threshold(), computes a threshold for a given
* test level which is needed to reject the null-hypothesis
/** @brief Independence test base class. Provides an interface for performing an
* independence test, i.e. Given samples from two distributions p_x and p_y, the
* null-hypothesis is: H0: p_xy=p_x*p_y, the alternative hypothesis:
* H1: p_xy!=p_x*p_y
*
* Abstract base class.
*
*/
class CIndependenceTestStatistic : public CTestStatistic
{
Expand Down
4 changes: 0 additions & 4 deletions src/shogun/statistics/TwoSampleTestStatistic.h
Expand Up @@ -21,10 +21,6 @@ class CFeatures;
* two-sample test, i.e. Given samples from two distributions p and q, the
* null-hypothesis is: H0: p==q, the alternative hypothesis: H1: p!=q.
*
* It is possible to define multiple ways to compute the p-value.
*
* Provides code for sampling the null-distribution via bootstrapping.
*
* Abstract base class.
*
*/
Expand Down

0 comments on commit 2c53078

Please sign in to comment.