Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge pull request #526 from karlnapf/master
some design updates
  • Loading branch information
karlnapf committed May 14, 2012
2 parents ea2e2f2 + 33e2e37 commit d2863d9
Show file tree
Hide file tree
Showing 8 changed files with 108 additions and 77 deletions.
15 changes: 6 additions & 9 deletions src/shogun/evaluation/CrossValidation.cpp
Expand Up @@ -160,7 +160,7 @@ CrossValidationResult CCrossValidation::evaluate()

/* perform all the x-val runs */
for (index_t i=0; i <m_num_runs; ++i)
results.vector[i]=evaluate_one_run();
results[i]=evaluate_one_run();

/* construct evaluation result */
CrossValidationResult result;
Expand Down Expand Up @@ -222,7 +222,7 @@ float64_t CCrossValidation::evaluate_one_run()
m_splitting_strategy->build_subsets();

/* results array */
float64_t* results=SG_MALLOC(float64_t, num_subsets);
SGVector<float64_t> results(num_subsets);

/* different behavior whether data is locked or not */
if (m_machine->is_data_locked())
Expand All @@ -237,21 +237,19 @@ float64_t CCrossValidation::evaluate_one_run()
/* train machine on training features */
m_machine->train_locked(inverse_subset_indices);

/* feature subset for testing, will be implicitly freed by CSubset */
/* feature subset for testing */
SGVector<index_t> subset_indices =
m_splitting_strategy->generate_subset_indices(i);

/* produce output for desired indices */
CLabels* result_labels=m_machine->apply_locked(subset_indices);
SG_REF(result_labels);

/* set subset for training labels, note that this will (later) free
* the subset_indices vector */
/* set subset for training labels */
m_labels->add_subset(subset_indices);

/* evaluate against own labels */
results[i]=m_evaluation_criterion->evaluate(result_labels,
m_labels);
results[i]=m_evaluation_criterion->evaluate(result_labels, m_labels);

/* remove subset to prevent side effects */
m_labels->remove_subset();
Expand Down Expand Up @@ -305,8 +303,7 @@ float64_t CCrossValidation::evaluate_one_run()
}

/* build arithmetic mean of results */
float64_t mean=CStatistics::mean(
SGVector <float64_t> (results, num_subsets));
float64_t mean=CStatistics::mean(results);

return mean;
}
54 changes: 10 additions & 44 deletions src/shogun/statistics/LinearTimeMMD.cpp
Expand Up @@ -42,8 +42,7 @@ void CLinearTimeMMD::init()
/* TODO register parameters*/

m_kernel=NULL;
m_threshold_method=MMD_BOOTSTRAP;
m_bootstrap_iterations=100;
m_threshold_method=MMD_NONE;
}

float64_t CLinearTimeMMD::compute_statistic()
Expand Down Expand Up @@ -94,57 +93,24 @@ float64_t CLinearTimeMMD::compute_statistic()
return 1.0/m_2*(first+second)+1.0/m*third;
}

float64_t CLinearTimeMMD::compute_threshold(float64_t confidence)
float64_t CLinearTimeMMD::compute_p_value(float64_t statistic)
{
float64_t result=0;

switch (m_threshold_method)
{
case MMD_BOOTSTRAP:
result=bootstrap_threshold(confidence);
break;
case MMD_NONE:
/* use super-class method for bootstrapping */
result=CTwoSampleTestStatistic::compute_p_value(statistic);
break;

default:
SG_ERROR("%s::compute_threshold(): Unknown method to compute"
" threshold!\n");
default:
SG_ERROR("%s::compute_threshold(): Unknown method to compute"
" threshold!\n");
break;

}

return result;
}

/** Computes a test threshold by bootstrapping: the merged samples of p and q
 * are shuffled m_bootstrap_iterations times (via an index-permutation subset
 * on m_p_and_q), the statistic is recomputed for every shuffle, and the
 * sorted statistics are indexed at round((1-confidence)*number of samples).
 *
 * @param confidence confidence level that selects the position in the sorted
 * bootstrap statistics
 * @return bootstrapped statistic at the selected position
 */
float64_t CLinearTimeMMD::bootstrap_threshold(float64_t confidence)
{
	/* one statistic per bootstrap iteration */
	SGVector<float64_t> results(m_bootstrap_iterations);

	/* memory for index permutations, allocated once outside the loop */
	SGVector<index_t> ind_permutation(m_p_and_q->get_num_vectors());
	ind_permutation.range_fill();

	for (index_t i=0; i<m_bootstrap_iterations; ++i)
	{
		/* idea: merge features of p and q, shuffle, and compute statistic.
		 * This is done using subsets here */

		/* create index permutation and add as subset. This will mix samples
		 * from p and q */
		CMath::permute_vector(ind_permutation);
		m_p_and_q->add_subset(ind_permutation);

		/* compute statistic for this permutation of mixed samples */
		results.vector[i]=compute_statistic();

		/* clean up */
		m_p_and_q->remove_subset();
	}

	/* sort elements and pick the one that corresponds to the confidence
	 * level */
	CMath::qsort(results.vector, results.vlen);
	index_t result_idx=CMath::round((1-confidence)*results.vlen);

	/* confidence==0 yields vlen (one past the end) and confidence>1 yields a
	 * negative index; clamp into the valid range.
	 * NOTE(review): assumes at least one bootstrap iteration was run */
	if (result_idx>=results.vlen)
		result_idx=results.vlen-1;

	if (result_idx<0)
		result_idx=0;

	return results.vector[result_idx];
}
10 changes: 5 additions & 5 deletions src/shogun/statistics/LinearTimeMMD.h
Expand Up @@ -18,9 +18,12 @@ namespace shogun

class CFeatures;

/** enum for different method to compute p-value of test, MMD_NONE will result
* in calling CTwoSampleTestStatistic::compute_p_value, where bootstrapping
* is implemented */
enum EMMDThreshold
{
MMD_BOOTSTRAP
MMD_NONE
};

class CLinearTimeMMD : public CTwoSampleTestStatistic
Expand All @@ -32,16 +35,13 @@ class CLinearTimeMMD : public CTwoSampleTestStatistic
virtual ~CLinearTimeMMD();

virtual float64_t compute_statistic();
virtual float64_t compute_threshold(float64_t confidence);
virtual float64_t compute_p_value(float64_t statistic);

inline virtual const char* get_name() const
{
return "LinearTimeMMD";
};

protected:
float64_t bootstrap_threshold(float64_t confidence);

private:
void init();

Expand Down
13 changes: 3 additions & 10 deletions src/shogun/statistics/StatisticalTest.cpp
Expand Up @@ -17,23 +17,20 @@ CStatisticalTest::CStatisticalTest() : CSGObject()
init();
}

CStatisticalTest::CStatisticalTest(CTestStatistic* statistic,
float64_t confidence) : CSGObject()
CStatisticalTest::CStatisticalTest(CTestStatistic* statistic) : CSGObject()
{
init();

m_statistic=statistic;
SG_REF(m_statistic);

m_confidence=confidence;
}

/** Destructor. Drops this object's reference on m_statistic; the constructor
 * that takes a statistic acquires that reference via SG_REF. */
CStatisticalTest::~CStatisticalTest()
{
SG_UNREF(m_statistic);
}

bool CStatisticalTest::perform_test()
float64_t CStatisticalTest::perform_test()
{
if (!m_statistic)
{
Expand All @@ -42,16 +39,12 @@ bool CStatisticalTest::perform_test()
}

float64_t statistic=m_statistic->compute_statistic();
float64_t threshold=m_statistic->compute_threshold(m_confidence);

/* reject null-hypothesis if statistic is greater than threshold */
return statistic<threshold;
return m_statistic->compute_p_value(statistic);
}

void CStatisticalTest::init()
{
/* TODO register parameters*/

m_statistic=NULL;
m_confidence=0;
}
13 changes: 6 additions & 7 deletions src/shogun/statistics/StatisticalTest.h
Expand Up @@ -21,24 +21,23 @@ class CStatisticalTest : public CSGObject
{
public:
CStatisticalTest();
CStatisticalTest(CTestStatistic* statistic, float64_t confidence);
CStatisticalTest(CTestStatistic* statistic);

virtual ~CStatisticalTest();

/** TODO
/** Performs the underlying statistical test. Returns p-value, which
* corresponds to the (1-p) percentile of the test's resulting statistic
* in the null distribution.
*
* @return true if the NULL-hypothesis is rejected */
virtual bool perform_test();
* @return p-value of test result */
virtual float64_t perform_test();

inline virtual const char* get_name() const { return "StatisticalTest"; }

private:
void init();

protected:
/** Confidence niveau of the test, test correct with (1-m_confidence) */
float64_t m_confidence;

CTestStatistic* m_statistic;
};

Expand Down
4 changes: 2 additions & 2 deletions src/shogun/statistics/TestStatistic.h
Expand Up @@ -28,9 +28,9 @@ class CTestStatistic : public CSGObject
return 0.0;
}

virtual float64_t compute_threshold(float64_t confidence)
virtual float64_t compute_p_value(float64_t statistic)
{
SG_ERROR("%s::compute_threshold() is not implemented!\n");
SG_ERROR("%s::compute_p_value() is not implemented!\n");
return 0.0;
}

Expand Down
51 changes: 51 additions & 0 deletions src/shogun/statistics/TwoSampleTestStatistic.cpp
Expand Up @@ -39,4 +39,55 @@ void CTwoSampleTestStatistic::init()
/* TODO register parameters */
m_p_and_q=NULL;
m_q_start=0;
m_bootstrap_iterations=100;
}

/** Draws m_bootstrap_iterations values of the test statistic with the samples
 * of p and q mixed: in every iteration an index permutation is put on
 * m_p_and_q as a subset, compute_statistic() is evaluated, and the subset is
 * removed again.
 *
 * @return vector holding one statistic per bootstrap iteration
 */
SGVector<float64_t> CTwoSampleTestStatistic::bootstrap_null()
{
	/* one slot per bootstrap iteration */
	SGVector<float64_t> null_samples(m_bootstrap_iterations);

	/* the index vector is allocated once and re-shuffled in place, so the
	 * loop itself does not allocate it again */
	const index_t num_vectors=m_p_and_q->get_num_vectors();
	SGVector<index_t> shuffled_indices(num_vectors);
	shuffled_indices.range_fill();

	index_t iteration=0;
	while (iteration<m_bootstrap_iterations)
	{
		/* shuffling the indices of the merged features mixes samples from
		 * p and q */
		CMath::permute_vector(shuffled_indices);
		m_p_and_q->add_subset(shuffled_indices);

		/* statistic on the shuffled data */
		null_samples[iteration]=compute_statistic();

		/* undo the subset before the next iteration */
		m_p_and_q->remove_subset();

		++iteration;
	}

	return null_samples;
}

/** Stores the number of iterations that bootstrap_null() runs.
 *
 * @param bootstrap_iterations new value for m_bootstrap_iterations
 */
void CTwoSampleTestStatistic::set_bootstrap_iterations(
		index_t bootstrap_iterations)
{
	this->m_bootstrap_iterations=bootstrap_iterations;
}

/** Computes a p-value for the given statistic from a bootstrapped null
 * distribution: one minus the fraction of values returned by
 * bootstrap_null() that are less than or equal to the statistic.
 *
 * @param statistic statistic value to compute the p-value for
 * @return p-value in [0,1]
 */
float64_t CTwoSampleTestStatistic::compute_p_value(float64_t statistic)
{
	/* bootstrap a bunch of statistic values from the null distribution */
	SGVector<float64_t> values=bootstrap_null();

	/* without any sample the ratio below would be 0/0 */
	if (values.vlen<=0)
	{
		SG_ERROR("%s::compute_p_value(): bootstrap_null() returned no "
				"samples, cannot compute p-value!\n", get_name());

		/* only reached if SG_ERROR returns: report no evidence against the
		 * null hypothesis */
		return 1.0;
	}

	/* number of null samples that do not exceed the statistic. This equals
	 * the position of the statistic in the sorted samples, so sorting (and
	 * thereby modifying) the returned vector is not needed */
	index_t num_not_greater=0;
	for (index_t i=0; i<values.vlen; ++i)
	{
		if (values[i]<=statistic)
			++num_not_greater;
	}

	/* return corresponding p-value */
	return 1.0-((float64_t)num_not_greater)/values.vlen;
}

25 changes: 25 additions & 0 deletions src/shogun/statistics/TwoSampleTestStatistic.h
Expand Up @@ -23,6 +23,28 @@ class CTwoSampleTestStatistic : public CTestStatistic
CTwoSampleTestStatistic();
CTwoSampleTestStatistic(CFeatures* p_and_q, index_t q_start);

/** merges both sets of samples and computes the test statistic
* m_bootstrap_iterations times
*
* @return vector of all statistics
*/
virtual SGVector<float64_t> bootstrap_null();

/** sets the number of bootstrap iterations for bootstrap_null()
*
* @param bootstrap_iterations how often bootstrapping shall be done
*/
void set_bootstrap_iterations(index_t bootstrap_iterations);

/** computes a p-value based on bootstrapping the null-distribution.
* This method should be overridden for different methods
*
* @param statistic statistic value to compute the p-value for
* @return p-value; the parameter statistic is the (1-p) percentile of the
* null distribution
*/
virtual float64_t compute_p_value(float64_t statistic);

virtual ~CTwoSampleTestStatistic();

inline virtual const char* get_name() const=0;
Expand All @@ -33,6 +55,9 @@ class CTwoSampleTestStatistic : public CTestStatistic
protected:
CFeatures* m_p_and_q;
index_t m_q_start;

/** number of iterations for bootstrapping null-distributions */
index_t m_bootstrap_iterations;
};

}
Expand Down

0 comments on commit d2863d9

Please sign in to comment.