Merge pull request #631 from karlnapf/master

some fixes and a test for HSIC gamma
shogun-toolbox · Jul 8, 2012 · f842ad6 · f842ad6
2 parents 28c06ce + 564e0bb
commit f842ad6
Show file tree

Hide file tree

Showing 5 changed files with 53 additions and 30 deletions.
diff --git a/examples/undocumented/libshogun/statistics_hsic.cpp b/examples/undocumented/libshogun/statistics_hsic.cpp
@@ -31,7 +31,7 @@ void create_mean_data(SGMatrix<float64_t> target, float64_t difference)
 	}
 }
 
-void create_fixed_data_kernel(CFeatures*& features_p,
+void create_fixed_data_kernel_small(CFeatures*& features_p,
 		CFeatures*& features_q, CKernel*& kernel_p, CKernel*& kernel_q)
 {
 	index_t m=2;
@@ -41,13 +41,44 @@ void create_fixed_data_kernel(CFeatures*& features_p,
 	for (index_t i=0; i<2*d*m; ++i)
 		p.matrix[i]=i;
 
-	p.display_matrix("p");
+//	p.display_matrix("p");
 
 	SGMatrix<float64_t> q(d,2*m);
 	for (index_t i=0; i<2*d*m; ++i)
 		q.matrix[i]=i+10;
 
-	q.display_matrix("q");
+//	q.display_matrix("q");
+
+	features_p=new CDenseFeatures<float64_t>(p);
+	features_q=new CDenseFeatures<float64_t>(q);
+
+	float64_t sigma_x=2;
+	float64_t sigma_y=3;
+	float64_t sq_sigma_x_twice=sigma_x*sigma_x*2;
+	float64_t sq_sigma_y_twice=sigma_y*sigma_y*2;
+
+	/* shoguns kernel width is different */
+	kernel_p=new CGaussianKernel(10, sq_sigma_x_twice);
+	kernel_q=new CGaussianKernel(10, sq_sigma_y_twice);
+}
+
+void create_fixed_data_kernel_big(CFeatures*& features_p,
+		CFeatures*& features_q, CKernel*& kernel_p, CKernel*& kernel_q)
+{
+	index_t m=10;
+	index_t d=7;
+
+	SGMatrix<float64_t> p(d,m);
+	for (index_t i=0; i<d*m; ++i)
+		p.matrix[i]=(i+8)%3;
+
+//	p.display_matrix("p");
+
+	SGMatrix<float64_t> q(d,m);
+	for (index_t i=0; i<d*m; ++i)
+		q.matrix[i]=((i+10)*(i%4+2))%4;
+
+//	q.display_matrix("q");
 
 	features_p=new CDenseFeatures<float64_t>(p);
 	features_q=new CDenseFeatures<float64_t>(q);
@@ -63,21 +94,23 @@ void create_fixed_data_kernel(CFeatures*& features_p,
 }
 
 /** tests the hsic statistic for a single fixed data case and ensures
- * equality with matlab implementation */
+ * equality with sma implementation */
 void test_hsic_fixed()
 {
 	CFeatures* features_p=NULL;
 	CFeatures* features_q=NULL;
 	CKernel* kernel_p=NULL;
 	CKernel* kernel_q=NULL;
-	create_fixed_data_kernel(features_p, features_q, kernel_p, kernel_q);
+	create_fixed_data_kernel_small(features_p, features_q, kernel_p, kernel_q);
+
+	index_t m=features_p->get_num_vectors();
 
 	CHSIC* hsic=new CHSIC(kernel_p, kernel_q, features_p, features_q);
 
-	/* assert matlab result */
+	/* assert matlab result, note that compute statistic computes m*hsic */
 	float64_t difference=hsic->compute_statistic();
 	SG_SPRINT("hsic fixed: %f\n", difference);
-	ASSERT(CMath::abs(difference-0.164761446385339)<10E-17);
+	ASSERT(CMath::abs(difference-m*0.164761446385339)<10E-16);
 
 	SG_UNREF(hsic);
 }
@@ -88,12 +121,14 @@ void test_hsic_gamma()
 	CFeatures* features_q=NULL;
 	CKernel* kernel_p=NULL;
 	CKernel* kernel_q=NULL;
-	create_fixed_data_kernel(features_p, features_q, kernel_p, kernel_q);
+	create_fixed_data_kernel_big(features_p, features_q, kernel_p, kernel_q);
 
 	CHSIC* hsic=new CHSIC(kernel_p, kernel_q, features_p, features_q);
 	hsic->set_null_approximation_method(HSIC_GAMMA);
 
-	hsic->compute_p_value(1);
+	float64_t p=hsic->compute_p_value(0.05);
+	SG_SPRINT("p-value: %f\n", p);
+	ASSERT(CMath::abs(p-0.172182287884256)<10E-15);
 
 	SG_UNREF(hsic);
 }

diff --git a/src/shogun/statistics/HSIC.cpp b/src/shogun/statistics/HSIC.cpp
@@ -69,7 +69,8 @@ float64_t CHSIC::compute_statistic()
 			result+=K(j, i)*L(i, j);
 	}
 
-	result/=m*m;
+	/* return m times statistic */
+	result/=m;
 
 	return result;
 }
@@ -107,23 +108,13 @@ float64_t CHSIC::compute_p_value_gamma(float64_t statistic)
 
 	index_t m=m_p->get_num_vectors();
 
-	/* NOTE: the gamma distribution is fitted to m*HSIC_b. Therefore, the
-	 * parameter statistic value is multiplied by m before anything is done.
-	 * This assumes that the feature data size is NOT changed after statistics
-	 * call */
-	statistic*=m;
-	SG_WARNING("check this! Gamma test uses different statistic!\n");
-
 	/* compute kernel matrices (these have to be stored unfortunately) */
 	m_kernel_p->init(m_p, m_p);
 	m_kernel_q->init(m_q, m_q);
 
 	SGMatrix<float64_t> K=m_kernel_p->get_kernel_matrix();
 	SGMatrix<float64_t> L=m_kernel_q->get_kernel_matrix();
 
-	K.display_matrix("K");
-	L.display_matrix("L");
-
 	/* compute sum and trace of uncentered kernel matrices, needed later */
 	float64_t trace_K=0;
 	float64_t trace_L=0;
@@ -146,9 +137,6 @@ float64_t CHSIC::compute_p_value_gamma(float64_t statistic)
 	K.center();
 	L.center();
 
-	K.display_matrix("K_centered");
-	L.display_matrix("L_centered");
-
 	/* compute the trace of MATLAB: (1/6 * Kc.*Lc).^2 Ü */
 	float64_t trace=0;
 	for (index_t i=0; i<m; ++i)
@@ -174,7 +162,7 @@ float64_t CHSIC::compute_p_value_gamma(float64_t statistic)
 
 	/* finally, compute variance of hsic under H0
 	 * MATLAB: varHSIC = 72*(m-4)*(m-5)/m/(m-1)/(m-2)/(m-3)  *  varHSIC */
-	var_hsic=72.0*(m-4)*(m-5)/m/(m-1)/(m-2)/(m-2)*var_hsic;
+	var_hsic=72.0*(m-4)*(m-5)/m/(m-1)/(m-2)/(m-3)*var_hsic;
 	SG_DEBUG("var_hsic: %f\n", var_hsic);
 
 	/* compute mean of matrices with diagonal elements zero on the base of sums
@@ -195,7 +183,7 @@ float64_t CHSIC::compute_p_value_gamma(float64_t statistic)
 	/* return: cdf('gam',statistic,al,bet) (MATLAB)
 	 * which will get the position in the null distribution */
 	float64_t result=CStatistics::gamma_cdf(statistic, a, b);
-	SG_DEBUG("result: %f\n", result);
+	SG_DEBUG("gamma_cdf(%f, %f, %f): %f\n", statistic, a, b, result);
 
 	SG_DEBUG(("leaving %s::compute_p_value_gamma()\n"), get_name());
 	return result;

diff --git a/src/shogun/statistics/HSIC.h b/src/shogun/statistics/HSIC.h
@@ -35,7 +35,7 @@ class CHSIC : public CKernelIndependenceTestStatistic
 
 	virtual ~CHSIC();
 
-	/** Computes the biased HSIC TODO */
+	/** Computes the biased m*HSIC TODO */
 	virtual float64_t compute_statistic();
 
 	/** computes a p-value based on current method for approximating the

diff --git a/src/shogun/statistics/IndependenceTestStatistic.cpp b/src/shogun/statistics/IndependenceTestStatistic.cpp
@@ -72,8 +72,8 @@ float64_t CIndependenceTestStatistic::compute_p_value(float64_t statistic)
 	}
 	else
 	{
-		SG_ERROR("%s::compute_p_value(): Unknown method to approximate null-"
-				"distirbution!\n", get_name());
+		SG_ERROR("%CIndependenceTestStatistic::compute_p_value(): Unknown "
+				"method to approximate null-distirbution!\n");
 	}
 
 	return result;

diff --git a/src/shogun/statistics/TwoSampleTestStatistic.cpp b/src/shogun/statistics/TwoSampleTestStatistic.cpp
@@ -103,8 +103,8 @@ float64_t CTwoSampleTestStatistic::compute_p_value(float64_t statistic)
 	}
 	else
 	{
-		SG_ERROR("%s::compute_p_value(): Unknown method to approximate null-"
-				"distribution!\n", get_name());
+		SG_ERROR("CTwoSampleTestStatistics::compute_p_value(): Unknown method "
+				"to approximate null-distribution!\n");
 	}
 
 	return result;