Skip to content

Commit

Permalink
Merge pull request #760 from karlnapf/master
Browse files Browse the repository at this point in the history
added a very basic kernel selection method based on median distances
  • Loading branch information
karlnapf committed Aug 28, 2012
2 parents b630803 + dd46c66 commit 3ec005c
Show file tree
Hide file tree
Showing 13 changed files with 390 additions and 28 deletions.
23 changes: 23 additions & 0 deletions examples/undocumented/libshogun/statistics.cpp
Expand Up @@ -50,6 +50,28 @@ void test_mean()
ASSERT(mean2==7);
}

void test_median()
{
SGMatrix<float64_t> X(3,5);
SGVector<float64_t> Y(X.num_rows*X.num_cols);
for (index_t i=0; i<X.num_rows*X.num_cols; ++i)
{
X.matrix[i]=CMath::random(0, 15);
Y[i]=X.matrix[i];
}
X.display_matrix("X");
Y.display_vector("Y");

/* test all median computation method on vector and matrix */
float64_t median=CStatistics::median(Y, false, false);
ASSERT(median==CStatistics::median(Y, false, true));
ASSERT(median==CStatistics::median(Y, true));

ASSERT(median==CStatistics::matrix_median(X, false, false));
ASSERT(median==CStatistics::matrix_median(X, false, true));
ASSERT(median==CStatistics::matrix_median(X, true));
}

void test_variance()
{
SGMatrix<float64_t> X(3,5);
Expand Down Expand Up @@ -311,6 +333,7 @@ int main(int argc, char **argv)
init_shogun_with_defaults();

test_mean();
test_median();
test_variance();
test_confidence_intervals();
test_inverse_student_t();
Expand Down
31 changes: 25 additions & 6 deletions examples/undocumented/python_modular/graphical/statistics_hsic.py
Expand Up @@ -15,6 +15,8 @@
from shogun.Kernel import GaussianKernel
from shogun.Statistics import HSIC
from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
from shogun.Distance import EuclideanDistance
from shogun.Mathematics import Statistics, Math

# for nice plotting that fits into our shogun tutorial
import latex_plot_inits
Expand All @@ -36,12 +38,29 @@
features_x=RealFeatures(array([data[0]]))
features_y=RealFeatures(array([data[1]]))

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in constrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
# Note that kernels per data can be different
kernel_x=GaussianKernel(10,8)
kernel_y=GaussianKernel(10,8)
# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shoguns kernel width is different to usual parametrization
# Therefore 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable
subset=Math.randperm_vec(features_x.get_num_vectors())
subset=subset[0:200]
features_x.add_subset(subset)
dist=EuclideanDistance(features_x, features_x)
distances=dist.get_distance_matrix()
features_x.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma_x=median_distance**2
features_y.add_subset(subset)
dist=EuclideanDistance(features_y, features_y)
distances=dist.get_distance_matrix()
features_y.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma_y=median_distance**2
print "median distance for Gaussian kernel on x:", sigma_x
print "median distance for Gaussian kernel on y:", sigma_y
kernel_x=GaussianKernel(10,sigma_x)
kernel_y=GaussianKernel(10,sigma_y)

# create hsic instance. Note that this is a convenience constructor which copies
# feature data. features_x and features_y are not the ones used in hsic.
Expand Down
Expand Up @@ -15,6 +15,8 @@
from shogun.Kernel import GaussianKernel
from shogun.Statistics import LinearTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
from shogun.Distance import EuclideanDistance
from shogun.Mathematics import Statistics, Math

# for nice plotting that fits into our shogun tutorial
import latex_plot_inits
Expand All @@ -35,10 +37,22 @@
# create shogun feature representation
features=RealFeatures(data)

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in constrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)
# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shoguns kernel width is different to usual parametrization
# Therefore 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable
# Using all distances here would blow up memory
subset=Math.randperm_vec(features.get_num_vectors())
subset=subset[0:200]
features.add_subset(subset)
dist=EuclideanDistance(features, features)
distances=dist.get_distance_matrix()
features.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma=median_distance**2
print "median distance for Gaussian kernel:", sigma
kernel=GaussianKernel(10,sigma)

# use biased statistic
mmd=LinearTimeMMD(kernel,features, m)
Expand Down
Expand Up @@ -15,6 +15,8 @@
from shogun.Kernel import GaussianKernel
from shogun.Statistics import QuadraticTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
from shogun.Distance import EuclideanDistance
from shogun.Mathematics import Statistics, Math

# for nice plotting that fits into our shogun tutorial
import latex_plot_inits
Expand All @@ -35,10 +37,21 @@
# create shogun feature representation
features=RealFeatures(data)

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in constrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)
# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shoguns kernel width is different to usual parametrization
# Therefore 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable
subset=Math.randperm_vec(features.get_num_vectors())
subset=subset[0:200]
features.add_subset(subset)
dist=EuclideanDistance(features, features)
distances=dist.get_distance_matrix()
features.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma=median_distance**2
print "median distance for Gaussian kernel:", sigma
kernel=GaussianKernel(10,sigma)

# use biased statistic
mmd=QuadraticTimeMMD(kernel,features, m)
Expand Down
31 changes: 26 additions & 5 deletions examples/undocumented/python_modular/statistics_hsic.py
Expand Up @@ -16,6 +16,8 @@ def statistics_hsic():
from shogun.Kernel import GaussianKernel
from shogun.Statistics import HSIC
from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
from shogun.Distance import EuclideanDistance
from shogun.Mathematics import Statistics, Math

# note that the HSIC has to store kernel matrices
# which upper bounds the sample size
Expand All @@ -31,12 +33,31 @@ def statistics_hsic():
features_x=RealFeatures(array([data[0]]))
features_y=RealFeatures(array([data[1]]))

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in constrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)
# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shoguns kernel width is different to usual parametrization
# Therefore 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable
subset=Math.randperm_vec(features_x.get_num_vectors())
subset=subset[0:200]
features_x.add_subset(subset)
dist=EuclideanDistance(features_x, features_x)
distances=dist.get_distance_matrix()
features_x.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma_x=median_distance**2
features_y.add_subset(subset)
dist=EuclideanDistance(features_y, features_y)
distances=dist.get_distance_matrix()
features_y.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma_y=median_distance**2
print "median distance for Gaussian kernel on x:", sigma_x
print "median distance for Gaussian kernel on y:", sigma_y
kernel_x=GaussianKernel(10,sigma_x)
kernel_y=GaussianKernel(10,sigma_y)

hsic=HSIC(kernel,kernel,features_x,features_y)
hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05 using different methods to approximate
Expand Down
22 changes: 18 additions & 4 deletions examples/undocumented/python_modular/statistics_linear_time_mmd.py
Expand Up @@ -14,6 +14,8 @@ def statistics_linear_time_mmd():
from shogun.Kernel import GaussianKernel
from shogun.Statistics import LinearTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
from shogun.Distance import EuclideanDistance
from shogun.Mathematics import Statistics, Math

# note that the linear time statistic is designed for much larger datasets
n=10000
Expand All @@ -31,10 +33,22 @@ def statistics_linear_time_mmd():
# create shogun feature representation
features=RealFeatures(data)

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in constrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)
# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shoguns kernel width is different to usual parametrization
# Therefore 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable
# Using all distances here would blow up memory
subset=Math.randperm_vec(features.get_num_vectors())
subset=subset[0:200]
features.add_subset(subset)
dist=EuclideanDistance(features, features)
distances=dist.get_distance_matrix()
features.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma=median_distance**2
print "median distance for Gaussian kernel:", sigma
kernel=GaussianKernel(10,sigma)

mmd=LinearTimeMMD(kernel,features, n)

Expand Down
Expand Up @@ -14,6 +14,8 @@ def statistics_quadratic_time_mmd():
from shogun.Kernel import GaussianKernel
from shogun.Statistics import QuadraticTimeMMD
from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
from shogun.Distance import EuclideanDistance
from shogun.Mathematics import Statistics, Math

# note that the quadratic time mmd has to store kernel matrices
# which upper bounds the sample size
Expand All @@ -30,10 +32,21 @@ def statistics_quadratic_time_mmd():
# create shogun feature representation
features=RealFeatures(data)

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in constrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)
# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shoguns kernel width is different to usual parametrization
# Therefore 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable
subset=Math.randperm_vec(features.get_num_vectors())
subset=subset[0:200]
features.add_subset(subset)
dist=EuclideanDistance(features, features)
distances=dist.get_distance_matrix()
features.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma=median_distance**2
print "median distance for Gaussian kernel:", sigma
kernel=GaussianKernel(10,sigma)

mmd=QuadraticTimeMMD(kernel,features, n)

Expand Down
19 changes: 18 additions & 1 deletion src/shogun/mathematics/Math.h
Expand Up @@ -222,7 +222,8 @@ class CMath : public CSGObject
/** Exchanges the values of two variables of any assignable type.
 *
 * @param a first variable, receives the old value of b
 * @param b second variable, receives the old value of a
 */
template <class T>
static inline void swap(T &a,T &b)
{
	/* the former "register" hint was dropped: it is deprecated since
	 * C++11, removed in C++17, and ignored by modern compilers anyway */
	T c=a;
	a=b;
	b=c;
}
Expand Down Expand Up @@ -540,6 +541,22 @@ class CMath : public CSGObject
return nnz;
}

/** Creates a vector holding a random permutation of the numbers 0 to n-1.
 *
 * @param n length of the permutation
 * @return SGVector containing 0...n-1 in random order
 */
static inline SGVector<int32_t> randperm_vec(int32_t n)
{
	/* NOTE(review): assumes the SGVector(ptr, len) constructor takes
	 * ownership of the buffer allocated by randperm() -- confirm */
	return SGVector<int32_t>(randperm(n), n);
}

/** Returns a random permutation of number from 0 to n-1.
* Caller has to free memory.
*
* @param n range of permutation
* @return random permutation of number from 0 to n-1
*/
static inline int32_t* randperm(int32_t n)
{
int32_t* perm = SG_MALLOC(int32_t, n);
Expand Down

0 comments on commit 3ec005c

Please sign in to comment.