Skip to content

Commit

Permalink
-finishing touches to the spectrum based null-distribution sampling
Browse files Browse the repository at this point in the history
(tested against MATLAB implementation)
  • Loading branch information
karlnapf committed Jun 4, 2012
1 parent ef62fca commit f6951d0
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 9 deletions.
38 changes: 31 additions & 7 deletions src/shogun/statistics/QuadraticTimeMMD.cpp
Expand Up @@ -106,31 +106,55 @@ float64_t CQuadraticTimeMMD::compute_p_value(float64_t statistic)
return result;
}

SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples)
SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples,
index_t num_eigenvalues)
{
/* the whole procedure is already checked against MATLAB implementation */

if (m_q_start!=m_p_and_q->get_num_vectors()/2)
{
/* TODO support different numbers of samples */
SG_ERROR("%s::sample_null_spectrum(): Currently, only equal "
"sample sizes are supported\n", get_name());
}

if (num_eigenvalues>2*m_q_start-1)
{
SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too large\n",
get_name());
}

/* 2m-1 is the maximum number of used eigenvalues */
if (num_eigenvalues==-1)
num_eigenvalues=2*m_q_start-1;

/* imaginary matrix K=[K KL; KL' L] (MATLAB notation)
* K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
* works since X and Y are concatenated here */
m_kernel->init(m_p_and_q, m_p_and_q);
SGMatrix<float64_t> K=m_kernel->get_kernel_matrix();

/* center matrix K=H*K*H */
CMath::center_matrix(K.matrix, K.num_rows, K.num_cols);

/* compute eigenvalues */
/* compute eigenvalues and select num_eigenvalues largest ones */
SGVector<float64_t> eigenvalues=CMath::compute_eigenvectors(K);
SGVector<float64_t> largest_ev(num_eigenvalues);

/* scale by 1/2/m abs take abs value */
for (index_t i=0; i< eigenvalues.vlen; ++i)
eigenvalues[i]=CMath::abs(eigenvalues[i])*0.5/m_q_start;
/* scale by 1/2/m on the fly and take abs value*/
for (index_t i=0; i<num_eigenvalues; ++i)
largest_ev[i]=CMath::abs(1.0/2/m_q_start*eigenvalues[eigenvalues.vlen-1-i]);

/* finally, sample from null distribution */
SGVector<float64_t> null_samples(num_samples);
for (index_t i=0; i<num_samples; ++i)
{
/* 2*sum(kEigs.*(randn(length(kEigs),1)).^2); */
null_samples[i]=0;
for (index_t j=0; j<eigenvalues.vlen; ++j)
null_samples[i]+=2*eigenvalues[j]*CMath::pow(CMath::randn_double(), 2);
for (index_t j=0; j<largest_ev.vlen; ++j)
null_samples[i]+=largest_ev[j]*CMath::pow(2.0, 2);

null_samples[i]*=2;
}

return null_samples;
Expand Down
19 changes: 17 additions & 2 deletions src/shogun/statistics/QuadraticTimeMMD.h
Expand Up @@ -34,8 +34,23 @@ class CQuadraticTimeMMD : public CKernelTwoSampleTestStatistic
return "QuadraticTimeMMD";
};

protected:
SGVector<float64_t> sample_null_spectrum(index_t num_samples);
/* returns a set of samples of an estimate of the null distribution
* using the Eigen-spectrum of the centered kernel matrix of the merged
* samples of p and q.
*
* See Gretton, A., Fukumizu, K., & Harchaoui, Z. (2011).
* A fast, consistent kernel two-sample test.
*
* @param num_samples number of samples to draw
* @param num_eigenvalues number of eigenvalues to use to draw samples
* Maximum number of 2m-1 where m is the size of both sets of samples.
* It is usually safe to use a smaller number since they decay very
* fast, however, a conservative approach would be to use all (-1 does
* this). See paper for details.
* @return samples from the estimated null distribution
*/
SGVector<float64_t> sample_null_spectrum(index_t num_samples,
index_t num_eigenvalues=-1);

private:
void init();
Expand Down

0 comments on commit f6951d0

Please sign in to comment.